Fix optimization bugs when pattern starts with lookahead.

This commit is contained in:
Philip.Hazel 2016-10-18 11:22:40 +00:00
parent 1828179ef9
commit a22c5e4204
8 changed files with 86 additions and 28 deletions

View File

@ -80,6 +80,19 @@ variables PCRE2GREP_COLOUR and PCRE2GREP_COLOR are not found.
14. Add the -t (grand total) option to pcre2grep. 14. Add the -t (grand total) option to pcre2grep.
15. A number of bugs have been mended relating to match start-up optimizations
when the first thing in a pattern is a positive lookahead. These all applied
only when PCRE2_NO_START_OPTIMIZE was *not* set:
(a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed
both an initial 'X' and a following 'X'.
(b) Some patterns starting with an assertion that started with .* were
incorrectly optimized as having to match at the start of the subject or
after a newline. There are cases where this is not true, for example,
(?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that
start with spaces. Starting .* in an assertion is no longer taken as an
indication of matching at the start (or after a newline).
Version 10.22 29-July-2016 Version 10.22 29-July-2016
-------------------------- --------------------------

View File

@ -3312,8 +3312,8 @@ while (ptr < ptrend)
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
{ {
errorcode = ERR58; errorcode = ERR58;
goto FAILED; goto FAILED;
} }
goto SET_RECURSION; goto SET_RECURSION;
/* An item starting (?- followed by a digit comes here via the "default" /* An item starting (?- followed by a digit comes here via the "default"
@ -5994,7 +5994,7 @@ for (;; pptr++)
zerofirstcuflags = firstcuflags; zerofirstcuflags = firstcuflags;
groupsetfirstcu = FALSE; groupsetfirstcu = FALSE;
if (bravalue >= OP_ONCE) if (bravalue >= OP_ONCE) /* Not an assertion */
{ {
/* If we have not yet set a firstcu in this branch, take it from the /* If we have not yet set a firstcu in this branch, take it from the
subpattern, remembering that it was set here so that a repeat of more subpattern, remembering that it was set here so that a repeat of more
@ -6034,15 +6034,19 @@ for (;; pptr++)
} }
} }
/* For a forward assertion, we take the reqcu, if set. This can be /* For a forward assertion, we take the reqcu, if set, provided that the
helpful if the pattern that follows the assertion doesn't set a different group has also set a firstcu. This can be helpful if the pattern that
char. For example, it's useful for /(?=abcde).+/. We can't set firstcu follows the assertion doesn't set a different char. For example, it's
for an assertion, however because it leads to incorrect effect for patterns useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
of a firstcu. This is overcome by a scan at the end if there's no the "real" "a" would then become a reqcu instead of a firstcu. This is
firstcu, looking for an asserted first char. */ overcome by a scan at the end if there's no firstcu, looking for an
asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
we must only take the reqcu when the group also set a firstcu. Otherwise,
in that example, 'X' ends up set for both. */
else if (bravalue == OP_ASSERT && subreqcuflags >= 0) else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
subfirstcuflags >= 0)
{ {
reqcu = subreqcu; reqcu = subreqcu;
reqcuflags = subreqcuflags; reqcuflags = subreqcuflags;
@ -6542,8 +6546,9 @@ for (;; pptr++)
*lengthptr += delta; *lengthptr += delta;
} }
/* This is compiling for real. If there is a set first code unit for /* This is compiling for real. If there is a set first code unit
the group, and we have not yet set a "required code unit", set it. */ for the group, and we have not yet set a "required code unit", set
it. */
else else
{ {
@ -7701,8 +7706,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .* have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .* because in that case we can't make the assumption. Also, the appearance of .*
inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
count, because once again the assumption no longer holds. or *SKIP does not count, because once again the assumption no longer holds.
Arguments: Arguments:
code points to start of the compiled pattern or a group code points to start of the compiled pattern or a group
@ -7711,13 +7716,14 @@ Arguments:
the less precise approach the less precise approach
cb points to the compile data cb points to the compile data
atomcount atomic group level atomcount atomic group level
inassert TRUE if in an assertion
Returns: TRUE or FALSE Returns: TRUE or FALSE
*/ */
static BOOL static BOOL
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
int atomcount) int atomcount, BOOL inassert)
{ {
do { do {
PCRE2_SPTR scode = first_significant_code( PCRE2_SPTR scode = first_significant_code(
@ -7748,7 +7754,7 @@ do {
return FALSE; return FALSE;
default: /* Assertion */ default: /* Assertion */
if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
do scode += GET(scode, 1); while (*scode == OP_ALT); do scode += GET(scode, 1); while (*scode == OP_ALT);
scode += 1 + LINK_SIZE; scode += 1 + LINK_SIZE;
break; break;
@ -7762,7 +7768,8 @@ do {
if (op == OP_BRA || op == OP_BRAPOS || if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS) op == OP_SBRA || op == OP_SBRAPOS)
{ {
if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
return FALSE;
} }
/* Capturing brackets */ /* Capturing brackets */
@ -7772,33 +7779,36 @@ do {
{ {
int n = GET2(scode, 1+LINK_SIZE); int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1u << n) : 1); int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
if (!is_startline(scode, new_map, cb, atomcount)) return FALSE; if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
} }
/* Positive forward assertions */ /* Positive forward assertions */
else if (op == OP_ASSERT) else if (op == OP_ASSERT)
{ {
if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
return FALSE;
} }
/* Atomic brackets */ /* Atomic brackets */
else if (op == OP_ONCE || op == OP_ONCE_NC) else if (op == OP_ONCE || op == OP_ONCE_NC)
{ {
if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE; if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
return FALSE;
} }
/* .* means "start at start or after \n" if it isn't in atomic brackets or /* .* means "start at start or after \n" if it isn't in atomic brackets or
brackets that may be referenced, as long as the pattern does not contain brackets that may be referenced or an assertion, and as long as the pattern
*PRUNE or *SKIP, because these break the feature. Consider, for example, does not contain *PRUNE or *SKIP, because these break the feature. Consider,
/.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
start of a line. There is also an option that disables this optimization. */ i.e. not at the start of a line. There is also an option that disables this
optimization. */
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
{ {
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
atomcount > 0 || cb->had_pruneorskip || atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
return FALSE; return FALSE;
} }
@ -9452,7 +9462,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
when *PRUNE and SKIP are not present. (There is an option that disables this when *PRUNE and SKIP are not present. (There is an option that disables this
case.) */ case.) */
else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE; else if (is_startline(codestart, 0, &cb, 0, FALSE))
re->flags |= PCRE2_STARTLINE;
} }
/* Handle the "required code unit", if one is set. In the case of an anchored /* Handle the "required code unit", if one is set. In the case of an anchored

6
testdata/testinput1 vendored
View File

@ -5806,4 +5806,10 @@ ef) x/x,mark
\= Expect no match \= Expect no match
baaab baaab
/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
\ Fred:099
/(?=.*X)X$/
\ X
# End of testinput1 # End of testinput1

2
testdata/testinput2 vendored
View File

@ -4892,4 +4892,6 @@ a)"xI
/(?<R>abc)(?(R)xyz)/B /(?<R>abc)(?(R)xyz)/B
/(?=.*[A-Z])/I
# End of testinput2 # End of testinput2

6
testdata/testinput4 vendored
View File

@ -2282,4 +2282,10 @@
\x{389} \x{389}
\x{20ac} \x{20ac}
/(?=.*b)\pL/
11bb
/(?(?=.*b)(?=.*b)\pL|.*c)/
11bb
# End of testinput4 # End of testinput4

View File

@ -9277,4 +9277,12 @@ MK: ab cd # comment\x0aef
baaab baaab
No match No match
/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
\ Fred:099
0:
/(?=.*X)X$/
\ X
0: X
# End of testinput1 # End of testinput1

View File

@ -8750,7 +8750,6 @@ Subject length lower bound = 1
/(?(?=.*b).*b|^d)/I /(?(?=.*b).*b|^d)/I
Capturing subpattern count = 0 Capturing subpattern count = 0
First code unit at start or follows newline
Subject length lower bound = 1 Subject length lower bound = 1
/xyz/auto_callout /xyz/auto_callout
@ -15334,6 +15333,11 @@ Failed: error 150 at offset 5: invalid range in character class
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/(?=.*[A-Z])/I
Capturing subpattern count = 0
May match empty string
Subject length lower bound = 0
# End of testinput2 # End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number) Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data Error -62: bad serialized data

View File

@ -3703,4 +3703,12 @@ No match
\x{20ac} \x{20ac}
No match No match
/(?=.*b)\pL/
11bb
0: b
/(?(?=.*b)(?=.*b)\pL|.*c)/
11bb
0: b
# End of testinput4 # End of testinput4