From 9332d4be690e3450a9b70a2c76ef9d48b3d29461 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 4 Aug 2018 08:20:18 +0000 Subject: [PATCH] Fix dynamic options changing bug. --- ChangeLog | 7 +++++++ src/pcre2_compile.c | 36 ++++++++++++++++++++---------------- testdata/testinput1 | 5 +++++ testdata/testoutput1 | 8 ++++++++ 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index c4f682e..9918619 100644 --- a/ChangeLog +++ b/ChangeLog @@ -140,6 +140,13 @@ generated by pcre2_maketables(), which uses isspace() to identify white space. Now, when Unicode support is compiled, PCRE2_EXTENDED also discards U+0085, U+200E, U+200F, U+2028, and U+2029, which are additional characters defined by Unicode as "Pattern White Space". This makes PCRE2 compatible with Perl. + +32. In certain circumstances, option settings within patterns were not being +correctly processed. For example, the pattern /((?i)A)(?m)B/ incorrectly +matched "ab". (The (?m) setting lost the fact that (?i) should be reset at the +end of its group during the parse process, but without another setting such as +(?m) the compile phase got it right.) This bug was introduced by the +refactoring in release 10.23. Version 10.31 12-February-2018 diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index f5eae11..42adde7 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2284,11 +2284,14 @@ typedef struct nest_save { #define NSF_RESET 0x0001u #define NSF_CONDASSERT 0x0002u -/* Of the options that are changeable within the pattern, these are tracked -during parsing. The rest are used from META_OPTIONS items when compiling. */ +/* Options that are changeable within the pattern must be tracked during +parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing, +but all must be tracked so that META_OPTIONS items set the correct values for +the main compiling phase. */ -#define PARSE_TRACKED_OPTIONS \ - (PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_NO_AUTO_CAPTURE) +#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \ + PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ + PCRE2_UNGREEDY) /* States used for analyzing ranges in character classes. The two OK values must be last. */ @@ -2468,16 +2471,16 @@ while (ptr < ptrend) /* EITHER: not both options set */ ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) != (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) || -#ifdef SUPPORT_UNICODE +#ifdef SUPPORT_UNICODE /* OR: character > 255 AND not Unicode Pattern White Space */ (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) || -#endif +#endif /* OR: not a # comment or isspace() white space */ (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0 #ifdef SUPPORT_UNICODE /* and not CHAR_NEL when Unicode is supported */ && c != CHAR_NEL -#endif +#endif ))) { PCRE2_SIZE verbnamelength; @@ -2562,16 +2565,16 @@ while (ptr < ptrend) character, not a code unit, so we must not use MAX_255 to test its size because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The whitespace characters are those designated as "Pattern White Space" by - Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is - U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a + Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is + U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a subset of space characters that match \h and \v. */ if ((options & PCRE2_EXTENDED) != 0) { if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue; -#ifdef SUPPORT_UNICODE +#ifdef SUPPORT_UNICODE if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue; -#endif +#endif if (c == CHAR_NUMBER_SIGN) { while (ptr < ptrend) @@ -3590,6 +3593,8 @@ while (ptr < ptrend) else { BOOL hyphenok = TRUE; + uint32_t oldoptions = options; + top_nest->reset_group = 0; top_nest->max_group = 0; set = unset = 0; @@ -3602,7 +3607,7 @@ while (ptr < ptrend) options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE); hyphenok = FALSE; - ptr++; + ptr++; } while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS && @@ -3618,7 +3623,7 @@ while (ptr < ptrend) goto FAILED; } optset = &unset; - hyphenok = FALSE; + hyphenok = FALSE; break; case CHAR_J: /* Record that it changed in the external options */ @@ -3677,10 +3682,9 @@ while (ptr < ptrend) } else *parsed_pattern++ = META_NOCAPTURE; - /* If nothing changed, no need to record. The check of hyphenok catches - the (?^) case. */ + /* If nothing changed, no need to record. */ - if (set != 0 || unset != 0 || !hyphenok) + if (options != oldoptions) { *parsed_pattern++ = META_OPTIONS; *parsed_pattern++ = options; diff --git a/testdata/testinput1 b/testdata/testinput1 index f4fb889..d8615ee 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -2184,6 +2184,11 @@ Blah blah blaH blah +/((?i)blah)\s+(?m)A(?i:\1)/ + blah ABLAH +\= Expect no match + blah aBLAH + /(?>a*)*/ a aa diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 2778363..77b9ff0 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -3346,6 +3346,14 @@ No match 0: blaH blah 1: blaH +/((?i)blah)\s+(?m)A(?i:\1)/ + blah ABLAH + 0: blah ABLAH + 1: blah +\= Expect no match + blah aBLAH +No match + /(?>a*)*/ a 0: a