Fix /x bug when pattern starts with whitespace or a #-comment followed by (?-x).

This commit is contained in:
Philip.Hazel 2015-12-03 16:58:31 +00:00
parent d71b70cdf7
commit 1f9b2a2e4b
4 changed files with 42 additions and 59 deletions

View File

@ -365,6 +365,12 @@ displaying fields containing NULLS:
(a) Within /x extended #-comments (a) Within /x extended #-comments
(b) Within the "name" part of (*MARK) and other *verbs (b) Within the "name" part of (*MARK) and other *verbs
(c) Within the text argument of a callout (c) Within the text argument of a callout
108. If a pattern that was compiled with PCRE2_EXTENDED started with white
space or a #-type comment that was followed by (?-x), which turns off
PCRE2_EXTENDED, and there was no subsequent (?x) to turn it on again,
pcre2_compile() assumed that (?-x) applied to the whole pattern and
consequently mis-compiled it. This bug was found by the LLVM fuzzer.
Version 10.20 30-June-2015 Version 10.20 30-June-2015

View File

@ -6862,44 +6862,16 @@ for (;; ptr++)
newoptions = (options | set) & (~unset); newoptions = (options | set) & (~unset);
/* If the options ended with ')' this is not the start of a nested /* If the options ended with ')' this is not the start of a nested
group with option changes, so the options change at this level. If this group with option changes, so the options change at this level. They
item is right at the start of the pattern, the options can be must also be passed back for use in subsequent branches. Reset the
abstracted and made external in the pre-compile phase, and ignored in greedy defaults and the case value for firstcu and reqcu. */
the compile phase. This can be helpful when matching -- for instance in
caseless checking of required bytes.
If the code pointer is not (cb->start_code + 1 + LINK_SIZE), we are
definitely *not* at the start of the pattern because something has been
compiled. In the pre-compile phase, however, the code pointer can have
that value after the start, because it gets reset as code is discarded
during the pre-compile. However, this can happen only at top level - if
we are within parentheses, the starting BRA will still be present. At
any parenthesis level, the length value can be used to test if anything
has been compiled at that level. Thus, a test for both these conditions
is necessary to ensure we correctly detect the start of the pattern in
both phases.
If we are not at the pattern start, reset the greedy defaults and the
case value for firstcu and reqcu. */
if (*ptr == CHAR_RIGHT_PARENTHESIS) if (*ptr == CHAR_RIGHT_PARENTHESIS)
{ {
if (code == cb->start_code + 1 + LINK_SIZE &&
(lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
{
cb->external_options = newoptions;
}
else
{
greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
}
/* Change options at this level, and pass them back for use
in subsequent branches. */
*optionsptr = options = newoptions; *optionsptr = options = newoptions;
greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
previous = NULL; /* This item can't be repeated */ previous = NULL; /* This item can't be repeated */
continue; /* It is complete */ continue; /* It is complete */
} }

11
testdata/testinput2 vendored
View File

@ -4724,4 +4724,15 @@ a)"xI
# /A(?#X\x00Y)B/ # /A(?#X\x00Y)B/
/41 28 3f 23 7b 00 7d 29 42/B,hex /41 28 3f 23 7b 00 7d 29 42/B,hex
# Tests for leading comment in extended patterns
/ (?-x):?/extended
/ (?-x):?/extended
/0b 28 3f 2d 78 29 3a/hex,extended
/#comment
(?-x):?/extended
# End of testinput2 # End of testinput2

44
testdata/testoutput2 vendored
View File

@ -431,8 +431,6 @@ Subject length lower bound = 2
/(?U)<.*>/I /(?U)<.*>/I
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none>
Overall options: ungreedy
First code unit = '<' First code unit = '<'
Last code unit = '>' Last code unit = '>'
Subject length lower bound = 2 Subject length lower bound = 2
@ -459,8 +457,6 @@ Subject length lower bound = 3
/(?U)={3,}?/I /(?U)={3,}?/I
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none>
Overall options: ungreedy
First code unit = '=' First code unit = '='
Last code unit = '=' Last code unit = '='
Subject length lower bound = 3 Subject length lower bound = 3
@ -494,8 +490,6 @@ Failed: error 125 at offset 12: lookbehind assertion is not fixed length
/(?i)abc/I /(?i)abc/I
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none>
Overall options: caseless
First code unit = 'a' (caseless) First code unit = 'a' (caseless)
Last code unit = 'c' (caseless) Last code unit = 'c' (caseless)
Subject length lower bound = 3 Subject length lower bound = 3
@ -508,7 +502,7 @@ Subject length lower bound = 1
/(?i)^1234/I /(?i)^1234/I
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none> Compile options: <none>
Overall options: anchored caseless Overall options: anchored
Subject length lower bound = 4 Subject length lower bound = 4
/(^b|(?i)^d)/I /(^b|(?i)^d)/I
@ -521,7 +515,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0 Capturing subpattern count = 0
May match empty string May match empty string
Compile options: <none> Compile options: <none>
Overall options: anchored dotall Overall options: anchored
Subject length lower bound = 0 Subject length lower bound = 0
/[abcd]/I /[abcd]/I
@ -531,15 +525,11 @@ Subject length lower bound = 1
/(?i)[abcd]/I /(?i)[abcd]/I
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none>
Overall options: caseless
Starting code units: A B C D a b c d Starting code units: A B C D a b c d
Subject length lower bound = 1 Subject length lower bound = 1
/(?m)[xy]|(b|c)/I /(?m)[xy]|(b|c)/I
Capturing subpattern count = 1 Capturing subpattern count = 1
Compile options: <none>
Overall options: multiline
Starting code units: b c x y Starting code units: b c x y
Subject length lower bound = 1 Subject length lower bound = 1
@ -551,8 +541,7 @@ Subject length lower bound = 1
/(?i)(^a|^b)/Im /(?i)(^a|^b)/Im
Capturing subpattern count = 1 Capturing subpattern count = 1
Compile options: multiline Options: multiline
Overall options: caseless multiline
First code unit at start or follows newline First code unit at start or follows newline
Subject length lower bound = 1 Subject length lower bound = 1
@ -1153,7 +1142,7 @@ Subject length lower bound = 1
------------------------------------------------------------------ ------------------------------------------------------------------
Capturing subpattern count = 1 Capturing subpattern count = 1
Compile options: <none> Compile options: <none>
Overall options: anchored dotall Overall options: anchored
Subject length lower bound = 1 Subject length lower bound = 1
/(?s:.*X|^B)/IB /(?s:.*X|^B)/IB
@ -2682,8 +2671,7 @@ No match
End End
------------------------------------------------------------------ ------------------------------------------------------------------
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: extended Options: extended
Overall options: caseless extended
First code unit = 'a' (caseless) First code unit = 'a' (caseless)
Last code unit = 'c' (caseless) Last code unit = 'c' (caseless)
Subject length lower bound = 3 Subject length lower bound = 3
@ -2697,8 +2685,7 @@ Subject length lower bound = 3
End End
------------------------------------------------------------------ ------------------------------------------------------------------
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: extended Options: extended
Overall options: caseless extended
First code unit = 'a' (caseless) First code unit = 'a' (caseless)
Last code unit = 'c' (caseless) Last code unit = 'c' (caseless)
Subject length lower bound = 3 Subject length lower bound = 3
@ -3043,8 +3030,6 @@ Subject length lower bound = 3
End End
------------------------------------------------------------------ ------------------------------------------------------------------
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none>
Overall options: ungreedy
First code unit = 'x' First code unit = 'x'
Last code unit = 'b' Last code unit = 'b'
Subject length lower bound = 3 Subject length lower bound = 3
@ -3427,8 +3412,6 @@ Subject length lower bound = 1
/(?i)[ab]/I /(?i)[ab]/I
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none>
Overall options: caseless
Starting code units: A B a b Starting code units: A B a b
Subject length lower bound = 1 Subject length lower bound = 1
@ -5841,7 +5824,7 @@ Named capturing subpatterns:
A 2 A 2
A 3 A 3
Compile options: <none> Compile options: <none>
Overall options: anchored dupnames Overall options: anchored
Duplicate name status changes Duplicate name status changes
Subject length lower bound = 2 Subject length lower bound = 2
a1b\=copy=A a1b\=copy=A
@ -13734,7 +13717,7 @@ Subject length lower bound = 1
/(*NO_DOTSTAR_ANCHOR)(?s).*\d/info /(*NO_DOTSTAR_ANCHOR)(?s).*\d/info
Capturing subpattern count = 0 Capturing subpattern count = 0
Compile options: <none> Compile options: <none>
Overall options: dotall no_dotstar_anchor Overall options: no_dotstar_anchor
Subject length lower bound = 1 Subject length lower bound = 1
'^(?:(a)|b)(?(1)A|B)' '^(?:(a)|b)(?(1)A|B)'
@ -15060,4 +15043,15 @@ Subject length lower bound = 0
End End
------------------------------------------------------------------ ------------------------------------------------------------------
# Tests for leading comment in extended patterns
/ (?-x):?/extended
/ (?-x):?/extended
/0b 28 3f 2d 78 29 3a/hex,extended
/#comment
(?-x):?/extended
# End of testinput2 # End of testinput2