From a083420cac1648b364fec1e3f364475065c71708 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 16 Jun 2017 17:51:13 +0000 Subject: [PATCH] Implement PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD for the benefit of pcre2grep. --- ChangeLog | 3 ++ doc/pcre2api.3 | 38 +++++++++++++----- doc/pcre2test.1 | 6 ++- src/pcre2.h | 6 ++- src/pcre2.h.in | 6 ++- src/pcre2_compile.c | 92 ++++++++++++++++++++++++++++++++------------ src/pcre2test.c | 2 + testdata/testinput2 | 22 ++++++++++- testdata/testoutput2 | 39 +++++++++++++++++-- 9 files changed, 170 insertions(+), 44 deletions(-) diff --git a/ChangeLog b/ChangeLog index d41ddff..3812c23 100644 --- a/ChangeLog +++ b/ChangeLog @@ -189,6 +189,9 @@ pattern lines. 41. Implement PCRE2_LITERAL and use it to support REG_NOSPEC. +42. Implement PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD for the benefit +of pcre2grep. + Version 10.23 14-February-2017 ------------------------------ diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 5972d3e..fe589fb 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "15 June 2017" "PCRE2 10.30" +.TH PCRE2API 3 "16 June 2017" "PCRE2 10.30" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1396,14 +1396,16 @@ words, whichever limit comes first is used. .sp PCRE2_LITERAL .sp -If this option is set, all meta-characters in the pattern are disabled, and it -is treated as a literal string. Matching literal strings with a regular -expression engine is not the most efficient way of doing it. If you are doing a -lot of literal matching and are worried about efficiency, you should consider -using other approaches. The only other options that are allowed with -PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, +If this option is set, all meta-characters in the pattern are disabled, and it +is treated as a literal string. 
Matching literal strings with a regular +expression engine is not the most efficient way of doing it. If you are doing a +lot of literal matching and are worried about efficiency, you should consider +using other approaches. The only other main options that are allowed with +PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, -PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. Any other options cause an error. +PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EXTRA_MATCH_LINE +and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an +error. .sp PCRE2_MATCH_UNSET_BACKREF .sp @@ -1689,6 +1691,24 @@ treated as single-character escapes. For example, \ej is a literal "j" and \ex{2z} is treated as the literal string "x{2z}". Setting this option means that typos in patterns may go undetected and have unexpected results. This is a dangerous option. Use with care. +.sp + PCRE2_EXTRA_MATCH_LINE +.sp +This option is provided for use by the \fB-x\fP option of \fBpcre2grep\fP. It +causes the pattern only to match complete lines. This is achieved by +automatically inserting the code for "^(?:" at the start of the compiled +pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched +line may be in the middle of the subject string. This option can be used with +PCRE2_LITERAL. +.sp + PCRE2_EXTRA_MATCH_WORD +.sp +This option is provided for use by the \fB-w\fP option of \fBpcre2grep\fP. It +causes the pattern only to match strings that have a word boundary at the start +and the end. This is achieved by automatically inserting the code for "\eb(?:" +at the start of the compiled pattern and ")\eb" at the end. The option may be +used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is +also set. . . .SH "COMPILATION ERROR CODES" @@ -3519,6 +3539,6 @@ Cambridge, England. 
.rs .sp .nf -Last updated: 15 June 2017 +Last updated: 16 June 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index d0bcce2..39b3a16 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "15 June 2017" "PCRE 10.30" +.TH PCRE2TEST 1 "16 June 2017" "PCRE 10.30" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -556,7 +556,9 @@ for a description of the effects of these options. /xx extended_more set PCRE2_EXTENDED_MORE firstline set PCRE2_FIRSTLINE literal set PCRE2_LITERAL + match_line set PCRE2_EXTRA_MATCH_LINE match_unset_backref set PCRE2_MATCH_UNSET_BACKREF + match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C never_ucp set PCRE2_NEVER_UCP @@ -1835,6 +1837,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 15 June 2017 +Last updated: 16 June 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/src/pcre2.h b/src/pcre2.h index 2024263..bab45b2 100644 --- a/src/pcre2.h +++ b/src/pcre2.h @@ -142,8 +142,10 @@ D is inspected during pcre2_dfa_match() execution /* An additional compile options word is available in the compile context. */ -#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ -#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ +#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ +#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ +#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ +#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ /* These are for pcre2_jit_compile(). */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index ec080cc..a110638 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -142,8 +142,10 @@ D is inspected during pcre2_dfa_match() execution /* An additional compile options word is available in the compile context. 
*/ -#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ -#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ +#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ +#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ +#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ +#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ /* These are for pcre2_jit_compile(). */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index ff9261b..a8801b3 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -690,24 +690,30 @@ static int posix_substitutes[] = { #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t))) #endif /* SUPPORT_UNICODE */ -/* Masks for checking option settings. */ - -#define PUBLIC_COMPILE_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ - PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ - PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_ENDANCHORED|PCRE2_EXTENDED| \ - PCRE2_EXTENDED_MORE|PCRE2_FIRSTLINE|PCRE2_LITERAL| \ - PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ - PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \ - PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ - PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ - PCRE2_UTF) +/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset +are allowed. 
*/ #define PUBLIC_LITERAL_COMPILE_OPTIONS \ (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \ PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \ PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF) +#define PUBLIC_COMPILE_OPTIONS \ + (PUBLIC_LITERAL_COMPILE_OPTIONS| \ + PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ + PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \ + PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \ + PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ + PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ + PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY) + +#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \ + (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD) + +#define PUBLIC_COMPILE_EXTRA_OPTIONS \ + (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \ + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) + /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and eint2 in pcre2posix.c may need to be updated, and a new error text must be @@ -2304,6 +2310,20 @@ PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ named_group *ng; nest_save *top_nest, *end_nests; +/* Insert leading items for word and line matching (features provided for the +benefit of pcre2grep). */ + +if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) + { + *parsed_pattern++ = META_CIRCUMFLEX; + *parsed_pattern++ = META_NOCAPTURE; + } +else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) + { + *parsed_pattern++ = META_ESCAPE + ESC_b; + *parsed_pattern++ = META_NOCAPTURE; + } + /* If the pattern is actually a literal string, process it separately to avoid cluttering up the main loop. 
*/ @@ -2323,8 +2343,7 @@ if ((options & PCRE2_LITERAL) != 0) auto_callout, parsed_pattern, cb); PARSED_LITERAL(c, parsed_pattern); } - *parsed_pattern = META_END; - return 0; + goto PARSED_END; } /* Process a real regex which may contain meta-characters. */ @@ -4166,9 +4185,24 @@ if (inverbname && ptr >= ptrend) /* Manage callout for the final item */ +PARSED_END: parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, parsed_pattern, cb); +/* Insert trailing items for word and line matching (features provided for the +benefit of pcre2grep). */ + +if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) + { + *parsed_pattern++ = META_KET; + *parsed_pattern++ = META_DOLLAR; + } +else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) + { + *parsed_pattern++ = META_KET; + *parsed_pattern++ = META_ESCAPE + ESC_b; + } + /* Terminate the parsed pattern, then return success if all groups are closed. Otherwise we have unclosed parentheses. */ @@ -4177,6 +4211,7 @@ if (parsed_pattern >= parsed_pattern_end) errorcode = ERR63; /* Internal error (parsed pattern overflow) */ goto FAILED; } + *parsed_pattern = META_END; if (nest_depth == 0) return 0; @@ -8984,26 +9019,28 @@ if (pattern == NULL) return NULL; } +/* A NULL compile context means "use a default context" */ + +if (ccontext == NULL) + ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); + /* Check that all undefined public option bits are zero. 
*/ -if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) +if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 || + (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0) { *errorptr = ERR17; return NULL; } if ((options & PCRE2_LITERAL) != 0 && - (options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0) + ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 || + (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0)) { *errorptr = ERR92; return NULL; } -/* A NULL compile context means "use a default context" */ - -if (ccontext == NULL) - ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); - /* A zero-terminated pattern is indicated by the special length value PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */ @@ -9262,10 +9299,10 @@ and comments removed (amongst other things). In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned 32-bit ints in the parsed pattern is bounded by the length of the pattern plus -one (for the terminator). The exceptional case is when running in 32-bit, -non-UTF mode, when literal characters greater than META_END (0x80000000) have -to be coded as two units. In this case, therefore, we scan the pattern to check -for such values. */ +one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE is +set. The exceptional case is when running in 32-bit, non-UTF mode, when literal +characters greater than META_END (0x80000000) have to be coded as two units. In +this case, therefore, we scan the pattern to check for such values. */ #if PCRE2_CODE_UNIT_WIDTH == 32 if (!utf) @@ -9282,6 +9319,11 @@ many smaller patterns the vector on the stack (which was set up above) can be used.
*/ parsed_size_needed = patlen - skipatstart + big32count; + +if ((ccontext->extra_options & + (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) + parsed_size_needed += 4; + if ((options & PCRE2_AUTO_CALLOUT) != 0) parsed_size_needed = (parsed_size_needed + 1) * 5; diff --git a/src/pcre2test.c b/src/pcre2test.c index 14e9153..1555719 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -638,7 +638,9 @@ static modstruct modlist[] = { { "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) }, { "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) }, { "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) }, + { "match_line", MOD_CTC, MOD_OPT, PCRE2_EXTRA_MATCH_LINE, CO(extra_options) }, { "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) }, + { "match_word", MOD_CTC, MOD_OPT, PCRE2_EXTRA_MATCH_WORD, CO(extra_options) }, { "max_pattern_length", MOD_CTC, MOD_SIZ, 0, CO(max_pattern_length) }, { "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) }, { "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) }, diff --git a/testdata/testinput2 b/testdata/testinput2 index 64640a7..77b0a1a 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5305,7 +5305,7 @@ a)"xI X\na\\b(c /a\b?c/literal,use_offset_limit - XXXXa\\b?c\=offset_limit=5 + XXXXa\\b?c\=offset_limit=4 \= Expect no match XXXXa\\b?c\=offset_limit=3 @@ -5327,4 +5327,24 @@ a)"xI /(*CR)abc/literal (*CR)abc +/cat|dog/match_word + the cat sat +\= Expect no match + caterpillar + snowcat + syndicate + +/(cat)|dog/match_line,literal + (cat)|dog +\= Expect no match + the cat sat + caterpillar + snowcat + syndicate + +/a whole line/match_line,multiline + Rhubarb \na whole line\n custard +\= Expect no match + Not a whole line + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index f80bd56..5db311c 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -16033,7 +16033,7 @@ Failed: error 108 at offset 4: range out of order in character class No match 
/a\b?c/literal,use_offset_limit - XXXXa\\b?c\=offset_limit=5 + XXXXa\\b?c\=offset_limit=4 0: a\b?c \= Expect no match XXXXa\\b?c\=offset_limit=3 @@ -16064,7 +16064,8 @@ Failed: error 192 at offset 0: invalid option bits with PCRE2_LITERAL +1 ^^ \ +2 ^ ^ b +3 ^ ^ ( - +4 ^ ^ + +4 ^ ^ c + +5 ^ ^ 0: a\b(c /a\b(c/literal,auto_callout @@ -16074,13 +16075,45 @@ Failed: error 192 at offset 0: invalid option bits with PCRE2_LITERAL +1 ^^ \ +2 ^ ^ b +3 ^ ^ ( - +4 ^ ^ + +4 ^ ^ c + +5 ^ ^ 0: a\b(c /(*CR)abc/literal (*CR)abc 0: (*CR)abc +/cat|dog/match_word + the cat sat + 0: cat +\= Expect no match + caterpillar +No match + snowcat +No match + syndicate +No match + +/(cat)|dog/match_line,literal + (cat)|dog + 0: (cat)|dog +\= Expect no match + the cat sat +No match + caterpillar +No match + snowcat +No match + syndicate +No match + +/a whole line/match_line,multiline + Rhubarb \na whole line\n custard + 0: a whole line +\= Expect no match + Not a whole line +No match + # End of testinput2 Error -65: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data