From 4fd8feaa508256606e0f150d661caeb5bc45ec3e Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 1 Nov 2016 15:58:28 +0000 Subject: [PATCH] Fix auto-anchor bug when .* is inside an assertion. --- ChangeLog | 25 ++++++++++++++++--------- src/pcre2_compile.c | 33 +++++++++++++++++++++++---------- testdata/testinput1 | 3 +++ testdata/testoutput1 | 4 ++++ 4 files changed, 46 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3403d1d..d7568e0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -29,15 +29,7 @@ some minor bugs and Perl incompatibilities were fixed, including: existing subpattern. (e) A conditional recursion test such as (?(R)...) misbehaved if there was a group whose name began with "R". - (f) The amount of memory needed for a compiled pattern was miscalculated if a - lookbehind contained more than one toplevel branch and the first branch - was of length zero. - (g) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero- - terminated pattern, if a # comment ran on to the end of the pattern, one - or more code units past the end were being read. - (h) An unterminated repeat at the end of a non-zero-terminated pattern (e.g. - "{2,2") could cause reading beyond the pattern. - + One effect of the refactoring is that some error numbers and messages have changed, and the pattern offset given for compiling errors is not always the right-most character that has been read. In particular, for a variable-length @@ -61,6 +53,17 @@ Some bugs in the refactored code were subsequently fixed before release: a lookup outside one of the global tables. A similar bug existed for wide characters in *VERB names. + (d) The amount of memory needed for a compiled pattern was miscalculated if a + lookbehind contained more than one toplevel branch and the first branch + was of length zero. + + (e) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero- + terminated pattern, if a # comment ran on to the end of the pattern, one + or more code units past the end were being read. + + (f) An unterminated repeat at the end of a non-zero-terminated pattern (e.g. + "{2,2") could cause reading beyond the pattern. + 4. Back references are now permitted in lookbehind assertions when there are no duplicated group numbers (that is, (?| has not been used), and, if the reference is by name, there is only one group of that name. The referenced @@ -122,6 +125,10 @@ library containing a test function that can be called by fuzzers to be compiled. A non-installed binary to run the test function locally, called pcre2fuzzcheck is also compiled. +18. A pattern with PCRE2_DOTALL (/s) set but not PCRE2_NO_DOTSTAR_ANCHOR, and +which started with .* inside a positive lookahead was incorrectly being +compiled as implicitly anchored. + Version 10.22 29-July-2016 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 06be3bf..edb49d0 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -7634,13 +7634,14 @@ Arguments: the less precise approach cb points to the compile data block atomcount atomic group level + inassert TRUE if in an assertion Returns: TRUE or FALSE */ static BOOL is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, - int atomcount) + int atomcount, BOOL inassert) { do { PCRE2_SPTR scode = first_significant_code( @@ -7652,7 +7653,8 @@ do { if (op == OP_BRA || op == OP_BRAPOS || op == OP_SBRA || op == OP_SBRAPOS) { - if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; + if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) + return FALSE; } /* Capturing brackets */ @@ -7662,33 +7664,44 @@ do { { int n = GET2(scode, 1+LINK_SIZE); int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE; + if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE; } - /* Positive forward assertions and conditions */ + /* Positive forward assertion */ - else if (op == OP_ASSERT || op == OP_COND) + else if (op == OP_ASSERT) { - if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; + if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; + } + + /* Condition */ + + else if (op == OP_COND) + { + if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) + return FALSE; } /* Atomic groups */ else if (op == OP_ONCE || op == OP_ONCE_NC) { - if (!is_anchored(scode, bracket_map, cb, atomcount + 1)) + if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert)) return FALSE; } /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and it isn't in brackets that are or may be referenced or inside an atomic - group. There is also an option that disables auto-anchoring. */ + group or an assertion. Also the pattern must not contain *PRUNE or *SKIP, + because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/ + with the subject "aab", which matches "b", i.e. not at the start of a line. + There is also an option that disables auto-anchoring. */ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)) { if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || + atomcount > 0 || cb->had_pruneorskip || inassert || (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) return FALSE; } @@ -9423,7 +9436,7 @@ there are no occurrences of *PRUNE or *SKIP (though there is an option to disable this case). */ if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_anchored(codestart, 0, &cb, 0)) + is_anchored(codestart, 0, &cb, 0, FALSE)) re->overall_options |= PCRE2_ANCHORED; /* If the pattern is still not anchored and we do not have a first code unit, diff --git a/testdata/testinput1 b/testdata/testinput1 index bcc42bd..7978e0c 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5812,4 +5812,7 @@ ef) x/x,mark /(?=.*X)X$/ \ X +/(?s)(?=.*?)b/ + aabc + # End of testinput1 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 837d1f4..617ca8a 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9285,4 +9285,8 @@ No match \ X 0: X +/(?s)(?=.*?)b/ + aabc + 0: b + # End of testinput1