Fix auto-possessification bug at the end of a capturing group that is called

recursively.
This commit is contained in:
Philip.Hazel 2018-01-31 17:53:56 +00:00
parent 7a5b962509
commit 53a588431c
6 changed files with 271 additions and 14 deletions

View File

@ -134,6 +134,15 @@ groups, making the ovector larger than this. The number has been increased to
131072, which allows for the maximum number of captures (65535) plus the
overall match. This fixes oss-fuzz issue 5415.
31. Auto-possessification at the end of a capturing group was dependent on what
follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused
incorrect behaviour when the group was called recursively from elsewhere in the
pattern where something different might follow. This bug is an unforseen
consequence of change #1 for 10.30 - the implementation of backtracking into
recursions. Iterators at the ends of capturing groups are no longer considered
for auto-possessification if the pattern contains any recursions. Fixes
Bugzilla #2232.
Version 10.30 14-August-2017
----------------------------

View File

@ -558,47 +558,73 @@ for(;;)
continue;
}
/* At the end of a branch, skip to the end of the group. */
if (c == OP_ALT)
{
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
}
/* Inspect the next opcode. */
switch(c)
{
case OP_END:
case OP_KETRPOS:
/* TRUE only in greedy case. The non-greedy case could be replaced by
an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
uses more memory, which we cannot get at this stage.) */
/* We can always possessify a greedy iterator at the end of the pattern,
which is reached after skipping over the final OP_KET. A non-greedy
iterator must never be possessified. */
case OP_END:
return base_list[1] != 0;
/* When an iterator is at the end of certain kinds of group we can inspect
what follows the group by skipping over the closing ket. Note that this
does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
iteration is variable (could be another iteration or could be the next
item). As these two opcodes are not listed in the next switch, they will
end up as the next code to inspect, and return FALSE by virtue of being
unsupported. */
case OP_KET:
/* If the bracket is capturing, and referenced by an OP_RECURSE, or
it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
cannot be converted to a possessive form. */
case OP_KETRPOS:
/* The non-greedy case cannot be converted to a possessive form. */
if (base_list[1] == 0) return FALSE;
/* If the bracket is capturing it might be referenced by an OP_RECURSE
so its last iterator can never be possessified if the pattern contains
recursions. (This could be improved by keeping a list of group numbers that
are called by recursion.) */
switch(*(code - GET(code, 1)))
{
case OP_CBRA:
case OP_SCBRA:
case OP_CBRAPOS:
case OP_SCBRAPOS:
if (cb->had_recurse) return FALSE;
break;
/* Atomic sub-patterns and assertions can always auto-possessify their
last iterator. However, if the group was entered as a result of checking
a previous iterator, this is not possible. */
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
/* Atomic sub-patterns and assertions can always auto-possessify their
last iterator. However, if the group was entered as a result of checking
a previous iterator, this is not possible. */
return !entered_a_group;
}
/* Skip over the bracket and inspect what comes next. */
code += PRIV(OP_lengths)[c];
continue;
/* Handle cases where the next item is a group. */
case OP_ONCE:
case OP_BRA:
case OP_CBRA:
@ -637,11 +663,15 @@ for(;;)
code += PRIV(OP_lengths)[c];
continue;
/* The next opcode does not need special handling; fall through and use it
to see if the base can be possessified. */
default:
break;
}
/* Check for a supported opcode, and load its properties. */
/* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */

30
testdata/testinput1 vendored
View File

@ -6159,4 +6159,34 @@ ef) x/x,mark
/((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
XYYZ
/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
aa
a
/^(a?)b(?1)a/
abaa
aba
baa
ba
/^(a?)+b(?1)a/
abaa
aba
baa
ba
/^(a?)++b(?1)a/
abaa
aba
baa
ba
/^(a?)+b/
b
ab
aaab
/(?=a+)a(a+)++b/
aab
# End of testinput1

17
testdata/testinput2 vendored
View File

@ -5412,4 +5412,21 @@ a)"xI
\= Expect no match
\na
# These tests are matched in test 1 as they are Perl compatible. Here we are
# looking at what does and does not get auto-possessified.
/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
/^(a?)b(?1)a/B
/^(a?)+b(?1)a/B
/^(a?)++b(?1)a/B
/^(a?)+b/B
/(?=a+)a(a+)++b/B
# End of testinput2

64
testdata/testoutput1 vendored
View File

@ -9758,4 +9758,68 @@ No match
1: Y
2:
/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
aa
0: aa
a
0: a
/^(a?)b(?1)a/
abaa
0: abaa
1: a
aba
0: aba
1: a
baa
0: baa
1:
ba
0: ba
1:
/^(a?)+b(?1)a/
abaa
0: abaa
1:
aba
0: aba
1:
baa
0: baa
1:
ba
0: ba
1:
/^(a?)++b(?1)a/
abaa
0: abaa
1:
aba
0: aba
1:
baa
0: baa
1:
ba
0: ba
1:
/^(a?)+b/
b
0: b
1:
ab
0: ab
1:
aaab
0: aaab
1:
/(?=a+)a(a+)++b/
aab
0: aab
1: a
# End of testinput1

109
testdata/testoutput2 vendored
View File

@ -12701,7 +12701,7 @@ Subject length lower bound = 5
Ket
a
CBraPos 1
a++
a+
KetRpos
a
Ket
@ -16468,6 +16468,113 @@ No match
\na
No match
# These tests are matched in test 1 as they are Perl compatible. Here we are
# looking at what does and does not get auto-possessified.
/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
------------------------------------------------------------------
Bra
Cond
Cond false
CBra 1
a?
Ket
Ket
^
Recurse
a
$
Ket
End
------------------------------------------------------------------
/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
------------------------------------------------------------------
Bra
Cond
Cond false
CBra 1
a?
Ket
X
Ket
^
Recurse
a
$
Ket
End
------------------------------------------------------------------
/^(a?)b(?1)a/B
------------------------------------------------------------------
Bra
^
CBra 1
a?
Ket
b
Recurse
a
Ket
End
------------------------------------------------------------------
/^(a?)+b(?1)a/B
------------------------------------------------------------------
Bra
^
SCBra 1
a?
KetRmax
b
Recurse
a
Ket
End
------------------------------------------------------------------
/^(a?)++b(?1)a/B
------------------------------------------------------------------
Bra
^
SCBraPos 1
a?
KetRpos
b
Recurse
a
Ket
End
------------------------------------------------------------------
/^(a?)+b/B
------------------------------------------------------------------
Bra
^
SCBra 1
a?
KetRmax
b
Ket
End
------------------------------------------------------------------
/(?=a+)a(a+)++b/B
------------------------------------------------------------------
Bra
Assert
a++
Ket
a
CBraPos 1
a++
KetRpos
b
Ket
End
------------------------------------------------------------------
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data