Fix auto-anchor bug when .* is inside an assertion.

This commit is contained in:
Philip.Hazel 2016-11-01 15:58:28 +00:00
parent 12a6d697fe
commit 4fd8feaa50
4 changed files with 46 additions and 19 deletions

View File

@ -29,15 +29,7 @@ some minor bugs and Perl incompatibilities were fixed, including:
existing subpattern. existing subpattern.
(e) A conditional recursion test such as (?(R)...) misbehaved if there was a (e) A conditional recursion test such as (?(R)...) misbehaved if there was a
group whose name began with "R". group whose name began with "R".
(f) The amount of memory needed for a compiled pattern was miscalculated if a
lookbehind contained more than one toplevel branch and the first branch
was of length zero.
(g) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero-
terminated pattern, if a # comment ran on to the end of the pattern, one
or more code units past the end were being read.
(h) An unterminated repeat at the end of a non-zero-terminated pattern (e.g.
"{2,2") could cause reading beyond the pattern.
One effect of the refactoring is that some error numbers and messages have One effect of the refactoring is that some error numbers and messages have
changed, and the pattern offset given for compiling errors is not always the changed, and the pattern offset given for compiling errors is not always the
right-most character that has been read. In particular, for a variable-length right-most character that has been read. In particular, for a variable-length
@ -61,6 +53,17 @@ Some bugs in the refactored code were subsequently fixed before release:
a lookup outside one of the global tables. A similar bug existed for wide a lookup outside one of the global tables. A similar bug existed for wide
characters in *VERB names. characters in *VERB names.
(d) The amount of memory needed for a compiled pattern was miscalculated if a
lookbehind contained more than one toplevel branch and the first branch
was of length zero.
(e) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero-
terminated pattern, if a # comment ran on to the end of the pattern, one
or more code units past the end were being read.
(f) An unterminated repeat at the end of a non-zero-terminated pattern (e.g.
"{2,2") could cause reading beyond the pattern.
4. Back references are now permitted in lookbehind assertions when there are 4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the no duplicated group numbers (that is, (?| has not been used), and, if the
reference is by name, there is only one group of that name. The referenced reference is by name, there is only one group of that name. The referenced
@ -122,6 +125,10 @@ library containing a test function that can be called by fuzzers to be
compiled. A non-installed binary to run the test function locally, called compiled. A non-installed binary to run the test function locally, called
pcre2fuzzcheck is also compiled. pcre2fuzzcheck is also compiled.
18. A pattern with PCRE2_DOTALL (/s) set but not PCRE2_NO_DOTSTAR_ANCHOR, and
which started with .* inside a positive lookahead, was incorrectly being
compiled as implicitly anchored.
Version 10.22 29-July-2016 Version 10.22 29-July-2016
-------------------------- --------------------------

View File

@ -7634,13 +7634,14 @@ Arguments:
the less precise approach the less precise approach
cb points to the compile data block cb points to the compile data block
atomcount atomic group level atomcount atomic group level
inassert TRUE if in an assertion
Returns: TRUE or FALSE Returns: TRUE or FALSE
*/ */
static BOOL static BOOL
is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
int atomcount) int atomcount, BOOL inassert)
{ {
do { do {
PCRE2_SPTR scode = first_significant_code( PCRE2_SPTR scode = first_significant_code(
@ -7652,7 +7653,8 @@ do {
if (op == OP_BRA || op == OP_BRAPOS || if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS) op == OP_SBRA || op == OP_SBRAPOS)
{ {
if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
return FALSE;
} }
/* Capturing brackets */ /* Capturing brackets */
@ -7662,33 +7664,44 @@ do {
{ {
int n = GET2(scode, 1+LINK_SIZE); int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1u << n) : 1); int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE; if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
} }
/* Positive forward assertions and conditions */ /* Positive forward assertion */
else if (op == OP_ASSERT || op == OP_COND) else if (op == OP_ASSERT)
{ {
if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
}
/* Condition */
else if (op == OP_COND)
{
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
return FALSE;
} }
/* Atomic groups */ /* Atomic groups */
else if (op == OP_ONCE || op == OP_ONCE_NC) else if (op == OP_ONCE || op == OP_ONCE_NC)
{ {
if (!is_anchored(scode, bracket_map, cb, atomcount + 1)) if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
return FALSE; return FALSE;
} }
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
it isn't in brackets that are or may be referenced or inside an atomic it isn't in brackets that are or may be referenced or inside an atomic
group. There is also an option that disables auto-anchoring. */ group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
with the subject "aab", which matches "b", i.e. not at the start of the
subject. There is also an option that disables auto-anchoring. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
op == OP_TYPEPOSSTAR)) op == OP_TYPEPOSSTAR))
{ {
if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
atomcount > 0 || cb->had_pruneorskip || atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
return FALSE; return FALSE;
} }
@ -9423,7 +9436,7 @@ there are no occurrences of *PRUNE or *SKIP (though there is an option to
disable this case). */ disable this case). */
if ((re->overall_options & PCRE2_ANCHORED) == 0 && if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
is_anchored(codestart, 0, &cb, 0)) is_anchored(codestart, 0, &cb, 0, FALSE))
re->overall_options |= PCRE2_ANCHORED; re->overall_options |= PCRE2_ANCHORED;
/* If the pattern is still not anchored and we do not have a first code unit, /* If the pattern is still not anchored and we do not have a first code unit,

3
testdata/testinput1 vendored
View File

@ -5812,4 +5812,7 @@ ef) x/x,mark
/(?=.*X)X$/ /(?=.*X)X$/
\ X \ X
/(?s)(?=.*?)b/
aabc
# End of testinput1 # End of testinput1

View File

@ -9285,4 +9285,8 @@ No match
\ X \ X
0: X 0: X
/(?s)(?=.*?)b/
aabc
0: b
# End of testinput1 # End of testinput1