Fix optimization bugs when pattern starts with lookahead.
This commit is contained in:
parent
1828179ef9
commit
a22c5e4204
13
ChangeLog
13
ChangeLog
|
@ -80,6 +80,19 @@ variables PCRE2GREP_COLOUR and PCRE2GREP_COLOR are not found.
|
||||||
|
|
||||||
14. Add the -t (grand total) option to pcre2grep.
|
14. Add the -t (grand total) option to pcre2grep.
|
||||||
|
|
||||||
|
15. A number of bugs have been mended relating to match start-up optimizations
|
||||||
|
when the first thing in a pattern is a positive lookahead. These all applied
|
||||||
|
only when PCRE2_NO_START_OPTIMIZE was *not* set:
|
||||||
|
|
||||||
|
(a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed
|
||||||
|
both an initial 'X' and a following 'X'.
|
||||||
|
(b) Some patterns starting with an assertion that started with .* were
|
||||||
|
incorrectly optimized as having to match at the start of the subject or
|
||||||
|
after a newline. There are cases where this is not true, for example,
|
||||||
|
(?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that
|
||||||
|
start with spaces. Starting .* in an assertion is no longer taken as an
|
||||||
|
indication of matching at the start (or after a newline).
|
||||||
|
|
||||||
|
|
||||||
Version 10.22 29-July-2016
|
Version 10.22 29-July-2016
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -3312,8 +3312,8 @@ while (ptr < ptrend)
|
||||||
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
|
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
|
||||||
{
|
{
|
||||||
errorcode = ERR58;
|
errorcode = ERR58;
|
||||||
goto FAILED;
|
goto FAILED;
|
||||||
}
|
}
|
||||||
goto SET_RECURSION;
|
goto SET_RECURSION;
|
||||||
|
|
||||||
/* An item starting (?- followed by a digit comes here via the "default"
|
/* An item starting (?- followed by a digit comes here via the "default"
|
||||||
|
@ -5994,7 +5994,7 @@ for (;; pptr++)
|
||||||
zerofirstcuflags = firstcuflags;
|
zerofirstcuflags = firstcuflags;
|
||||||
groupsetfirstcu = FALSE;
|
groupsetfirstcu = FALSE;
|
||||||
|
|
||||||
if (bravalue >= OP_ONCE)
|
if (bravalue >= OP_ONCE) /* Not an assertion */
|
||||||
{
|
{
|
||||||
/* If we have not yet set a firstcu in this branch, take it from the
|
/* If we have not yet set a firstcu in this branch, take it from the
|
||||||
subpattern, remembering that it was set here so that a repeat of more
|
subpattern, remembering that it was set here so that a repeat of more
|
||||||
|
@ -6034,15 +6034,19 @@ for (;; pptr++)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* For a forward assertion, we take the reqcu, if set. This can be
|
/* For a forward assertion, we take the reqcu, if set, provided that the
|
||||||
helpful if the pattern that follows the assertion doesn't set a different
|
group has also set a firstcu. This can be helpful if the pattern that
|
||||||
char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
|
follows the assertion doesn't set a different char. For example, it's
|
||||||
for an assertion, however because it leads to incorrect effect for patterns
|
useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
|
||||||
such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
|
because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
|
||||||
of a firstcu. This is overcome by a scan at the end if there's no
|
the "real" "a" would then become a reqcu instead of a firstcu. This is
|
||||||
firstcu, looking for an asserted first char. */
|
overcome by a scan at the end if there's no firstcu, looking for an
|
||||||
|
asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
|
||||||
|
we must only take the reqcu when the group also set a firstcu. Otherwise,
|
||||||
|
in that example, 'X' ends up set for both. */
|
||||||
|
|
||||||
else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
|
else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
|
||||||
|
subfirstcuflags >= 0)
|
||||||
{
|
{
|
||||||
reqcu = subreqcu;
|
reqcu = subreqcu;
|
||||||
reqcuflags = subreqcuflags;
|
reqcuflags = subreqcuflags;
|
||||||
|
@ -6542,8 +6546,9 @@ for (;; pptr++)
|
||||||
*lengthptr += delta;
|
*lengthptr += delta;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This is compiling for real. If there is a set first code unit for
|
/* This is compiling for real. If there is a set first code unit
|
||||||
the group, and we have not yet set a "required code unit", set it. */
|
for the group, and we have not yet set a "required code unit", set
|
||||||
|
it. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -7701,8 +7706,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
|
||||||
the beginning or after \n). As in the case of is_anchored() (see above), we
|
the beginning or after \n). As in the case of is_anchored() (see above), we
|
||||||
have to take account of back references to capturing brackets that contain .*
|
have to take account of back references to capturing brackets that contain .*
|
||||||
because in that case we can't make the assumption. Also, the appearance of .*
|
because in that case we can't make the assumption. Also, the appearance of .*
|
||||||
inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
|
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
|
||||||
count, because once again the assumption no longer holds.
|
or *SKIP does not count, because once again the assumption no longer holds.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
code points to start of the compiled pattern or a group
|
code points to start of the compiled pattern or a group
|
||||||
|
@ -7711,13 +7716,14 @@ Arguments:
|
||||||
the less precise approach
|
the less precise approach
|
||||||
cb points to the compile data
|
cb points to the compile data
|
||||||
atomcount atomic group level
|
atomcount atomic group level
|
||||||
|
inassert TRUE if in an assertion
|
||||||
|
|
||||||
Returns: TRUE or FALSE
|
Returns: TRUE or FALSE
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static BOOL
|
static BOOL
|
||||||
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
|
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
|
||||||
int atomcount)
|
int atomcount, BOOL inassert)
|
||||||
{
|
{
|
||||||
do {
|
do {
|
||||||
PCRE2_SPTR scode = first_significant_code(
|
PCRE2_SPTR scode = first_significant_code(
|
||||||
|
@ -7748,7 +7754,7 @@ do {
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
default: /* Assertion */
|
default: /* Assertion */
|
||||||
if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
|
||||||
do scode += GET(scode, 1); while (*scode == OP_ALT);
|
do scode += GET(scode, 1); while (*scode == OP_ALT);
|
||||||
scode += 1 + LINK_SIZE;
|
scode += 1 + LINK_SIZE;
|
||||||
break;
|
break;
|
||||||
|
@ -7762,7 +7768,8 @@ do {
|
||||||
if (op == OP_BRA || op == OP_BRAPOS ||
|
if (op == OP_BRA || op == OP_BRAPOS ||
|
||||||
op == OP_SBRA || op == OP_SBRAPOS)
|
op == OP_SBRA || op == OP_SBRAPOS)
|
||||||
{
|
{
|
||||||
if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
|
||||||
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Capturing brackets */
|
/* Capturing brackets */
|
||||||
|
@ -7772,33 +7779,36 @@ do {
|
||||||
{
|
{
|
||||||
int n = GET2(scode, 1+LINK_SIZE);
|
int n = GET2(scode, 1+LINK_SIZE);
|
||||||
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||||||
if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
|
if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Positive forward assertions */
|
/* Positive forward assertions */
|
||||||
|
|
||||||
else if (op == OP_ASSERT)
|
else if (op == OP_ASSERT)
|
||||||
{
|
{
|
||||||
if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
|
||||||
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Atomic brackets */
|
/* Atomic brackets */
|
||||||
|
|
||||||
else if (op == OP_ONCE || op == OP_ONCE_NC)
|
else if (op == OP_ONCE || op == OP_ONCE_NC)
|
||||||
{
|
{
|
||||||
if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
|
if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
|
||||||
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* .* means "start at start or after \n" if it isn't in atomic brackets or
|
/* .* means "start at start or after \n" if it isn't in atomic brackets or
|
||||||
brackets that may be referenced, as long as the pattern does not contain
|
brackets that may be referenced or an assertion, and as long as the pattern
|
||||||
*PRUNE or *SKIP, because these break the feature. Consider, for example,
|
does not contain *PRUNE or *SKIP, because these break the feature. Consider,
|
||||||
/.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
|
for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
|
||||||
start of a line. There is also an option that disables this optimization. */
|
i.e. not at the start of a line. There is also an option that disables this
|
||||||
|
optimization. */
|
||||||
|
|
||||||
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
|
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
|
||||||
{
|
{
|
||||||
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
|
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
|
||||||
atomcount > 0 || cb->had_pruneorskip ||
|
atomcount > 0 || cb->had_pruneorskip || inassert ||
|
||||||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
|
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
@ -9452,7 +9462,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
|
||||||
when *PRUNE and *SKIP are not present. (There is an option that disables this
|
when *PRUNE and *SKIP are not present. (There is an option that disables this
|
||||||
case.) */
|
case.) */
|
||||||
|
|
||||||
else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
|
else if (is_startline(codestart, 0, &cb, 0, FALSE))
|
||||||
|
re->flags |= PCRE2_STARTLINE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Handle the "required code unit", if one is set. In the case of an anchored
|
/* Handle the "required code unit", if one is set. In the case of an anchored
|
||||||
|
|
|
@ -5806,4 +5806,10 @@ ef) x/x,mark
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
baaab
|
baaab
|
||||||
|
|
||||||
|
/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||||||
|
\ Fred:099
|
||||||
|
|
||||||
|
/(?=.*X)X$/
|
||||||
|
\ X
|
||||||
|
|
||||||
# End of testinput1
|
# End of testinput1
|
||||||
|
|
|
@ -4892,4 +4892,6 @@ a)"xI
|
||||||
|
|
||||||
/(?<R>abc)(?(R)xyz)/B
|
/(?<R>abc)(?(R)xyz)/B
|
||||||
|
|
||||||
|
/(?=.*[A-Z])/I
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -2282,4 +2282,10 @@
|
||||||
\x{389}
|
\x{389}
|
||||||
\x{20ac}
|
\x{20ac}
|
||||||
|
|
||||||
|
/(?=.*b)\pL/
|
||||||
|
11bb
|
||||||
|
|
||||||
|
/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||||||
|
11bb
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
|
@ -9277,4 +9277,12 @@ MK: ab cd # comment\x0aef
|
||||||
baaab
|
baaab
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||||||
|
\ Fred:099
|
||||||
|
0:
|
||||||
|
|
||||||
|
/(?=.*X)X$/
|
||||||
|
\ X
|
||||||
|
0: X
|
||||||
|
|
||||||
# End of testinput1
|
# End of testinput1
|
||||||
|
|
|
@ -8750,7 +8750,6 @@ Subject length lower bound = 1
|
||||||
|
|
||||||
/(?(?=.*b).*b|^d)/I
|
/(?(?=.*b).*b|^d)/I
|
||||||
Capturing subpattern count = 0
|
Capturing subpattern count = 0
|
||||||
First code unit at start or follows newline
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
/xyz/auto_callout
|
/xyz/auto_callout
|
||||||
|
@ -15334,6 +15333,11 @@ Failed: error 150 at offset 5: invalid range in character class
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/(?=.*[A-Z])/I
|
||||||
|
Capturing subpattern count = 0
|
||||||
|
May match empty string
|
||||||
|
Subject length lower bound = 0
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
Error -62: bad serialized data
|
Error -62: bad serialized data
|
||||||
|
|
|
@ -3703,4 +3703,12 @@ No match
|
||||||
\x{20ac}
|
\x{20ac}
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/(?=.*b)\pL/
|
||||||
|
11bb
|
||||||
|
0: b
|
||||||
|
|
||||||
|
/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||||||
|
11bb
|
||||||
|
0: b
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
Loading…
Reference in New Issue