diff --git a/ChangeLog b/ChangeLog index 878f563..58dd453 100644 --- a/ChangeLog +++ b/ChangeLog @@ -80,6 +80,19 @@ variables PCRE2GREP_COLOUR and PCRE2GREP_COLOR are not found. 14. Add the -t (grand total) option to pcre2grep. +15. A number of bugs have been mended relating to match start-up optimizations +when the first thing in a pattern is a positive lookahead. These all applied +only when PCRE2_NO_START_OPTIMIZE was *not* set: + + (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed + both an initial 'X' and a following 'X'. + (b) Some patterns starting with an assertion that started with .* were + incorrectly optimized as having to match at the start of the subject or + after a newline. There are cases where this is not true, for example, + (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that + start with spaces. Starting .* in an assertion is no longer taken as an + indication of matching at the start (or after a newline). + Version 10.22 29-July-2016 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 248db5d..ff4972a 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3312,8 +3312,8 @@ while (ptr < ptrend) if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) { errorcode = ERR58; - goto FAILED; - } + goto FAILED; + } goto SET_RECURSION; /* An item starting (?- followed by a digit comes here via the "default" @@ -5994,7 +5994,7 @@ for (;; pptr++) zerofirstcuflags = firstcuflags; groupsetfirstcu = FALSE; - if (bravalue >= OP_ONCE) + if (bravalue >= OP_ONCE) /* Not an assertion */ { /* If we have not yet set a firstcu in this branch, take it from the subpattern, remembering that it was set here so that a repeat of more @@ -6034,15 +6034,19 @@ for (;; pptr++) } } - /* For a forward assertion, we take the reqcu, if set. This can be - helpful if the pattern that follows the assertion doesn't set a different - char. For example, it's useful for /(?=abcde).+/. We can't set firstcu - for an assertion, however because it leads to incorrect effect for patterns - such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead - of a firstcu. This is overcome by a scan at the end if there's no - firstcu, looking for an asserted first char. */ + /* For a forward assertion, we take the reqcu, if set, provided that the + group has also set a firstcu. This can be helpful if the pattern that + follows the assertion doesn't set a different char. For example, it's + useful for /(?=abcde).+/. We can't set firstcu for an assertion, however + because it leads to incorrect effect for patterns such as /(?=a)a.+/ when + the "real" "a" would then become a reqcu instead of a firstcu. This is + overcome by a scan at the end if there's no firstcu, looking for an + asserted first char. A similar effect for patterns like /(?=.*X)X$/ means + we must only take the reqcu when the group also set a firstcu. Otherwise, + in that example, 'X' ends up set for both. */ - else if (bravalue == OP_ASSERT && subreqcuflags >= 0) + else if (bravalue == OP_ASSERT && subreqcuflags >= 0 && + subfirstcuflags >= 0) { reqcu = subreqcu; reqcuflags = subreqcuflags; @@ -6542,8 +6546,9 @@ for (;; pptr++) *lengthptr += delta; } - /* This is compiling for real. If there is a set first code unit for - the group, and we have not yet set a "required code unit", set it. */ + /* This is compiling for real. If there is a set first code unit + for the group, and we have not yet set a "required code unit", set + it. */ else { @@ -7701,8 +7706,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at the beginning or after \n). As in the case of is_anchored() (see above), we have to take account of back references to capturing brackets that contain .* because in that case we can't make the assumption. Also, the appearance of .* -inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not -count, because once again the assumption no longer holds. +inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE +or *SKIP does not count, because once again the assumption no longer holds. Arguments: code points to start of the compiled pattern or a group @@ -7711,13 +7716,14 @@ Arguments: the less precise approach cb points to the compile data atomcount atomic group level + inassert TRUE if in an assertion Returns: TRUE or FALSE */ static BOOL is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, - int atomcount) + int atomcount, BOOL inassert) { do { PCRE2_SPTR scode = first_significant_code( @@ -7748,7 +7754,7 @@ do { return FALSE; default: /* Assertion */ - if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; + if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; do scode += GET(scode, 1); while (*scode == OP_ALT); scode += 1 + LINK_SIZE; break; @@ -7762,7 +7768,8 @@ do { if (op == OP_BRA || op == OP_BRAPOS || op == OP_SBRA || op == OP_SBRAPOS) { - if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; + if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) + return FALSE; } /* Capturing brackets */ @@ -7772,33 +7779,36 @@ do { { int n = GET2(scode, 1+LINK_SIZE); int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_startline(scode, new_map, cb, atomcount)) return FALSE; + if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; } /* Positive forward assertions */ else if (op == OP_ASSERT) { - if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; + if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) + return FALSE; } /* Atomic brackets */ else if (op == OP_ONCE || op == OP_ONCE_NC) { - if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE; + if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) + return FALSE; } /* .* means "start at start or after \n" if it isn't in atomic brackets or - brackets that may be referenced, as long as the pattern does not contain - *PRUNE or *SKIP, because these break the feature. Consider, for example, - /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the - start of a line. There is also an option that disables this optimization. */ + brackets that may be referenced or an assertion, and as long as the pattern + does not contain *PRUNE or *SKIP, because these break the feature. Consider, + for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", + i.e. not at the start of a line. There is also an option that disables this + optimization. */ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) { if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || + atomcount > 0 || cb->had_pruneorskip || inassert || (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) return FALSE; } @@ -9452,7 +9462,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0) when *PRUNE and SKIP are not present. (There is an option that disables this case.) */ - else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE; + else if (is_startline(codestart, 0, &cb, 0, FALSE)) + re->flags |= PCRE2_STARTLINE; } /* Handle the "required code unit", if one is set. In the case of an anchored diff --git a/testdata/testinput1 b/testdata/testinput1 index 3f0ccb6..bcc42bd 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5806,4 +5806,10 @@ ef) x/x,mark \= Expect no match baaab +/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ + \ Fred:099 + +/(?=.*X)X$/ + \ X + # End of testinput1 diff --git a/testdata/testinput2 b/testdata/testinput2 index 4120643..3a81dc0 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4892,4 +4892,6 @@ a)"xI /(?abc)(?(R)xyz)/B +/(?=.*[A-Z])/I + # End of testinput2 diff --git a/testdata/testinput4 b/testdata/testinput4 index ce9145d..73582b7 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2282,4 +2282,10 @@ \x{389} \x{20ac} +/(?=.*b)\pL/ + 11bb + +/(?(?=.*b)(?=.*b)\pL|.*c)/ + 11bb + # End of testinput4 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 68f7015..837d1f4 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9277,4 +9277,12 @@ MK: ab cd # comment\x0aef baaab No match +/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ + \ Fred:099 + 0: + +/(?=.*X)X$/ + \ X + 0: X + # End of testinput1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index ff30167..ccdb5ef 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -8750,7 +8750,6 @@ Subject length lower bound = 1 /(?(?=.*b).*b|^d)/I Capturing subpattern count = 0 -First code unit at start or follows newline Subject length lower bound = 1 /xyz/auto_callout @@ -15334,6 +15333,11 @@ Failed: error 150 at offset 5: invalid range in character class End ------------------------------------------------------------------ +/(?=.*[A-Z])/I +Capturing subpattern count = 0 +May match empty string +Subject length lower bound = 0 + # End of testinput2 Error -63: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 701d411..d2d5e51 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -3703,4 +3703,12 @@ No match \x{20ac} No match +/(?=.*b)\pL/ + 11bb + 0: b + +/(?(?=.*b)(?=.*b)\pL|.*c)/ + 11bb + 0: b + # End of testinput4