diff --git a/ChangeLog b/ChangeLog index 9164443..103c39b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -134,6 +134,15 @@ groups, making the ovector larger than this. The number has been increased to 131072, which allows for the maximum number of captures (65535) plus the overall match. This fixes oss-fuzz issue 5415. +31. Auto-possessification at the end of a capturing group was dependent on what +follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused +incorrect behaviour when the group was called recursively from elsewhere in the +pattern where something different might follow. This bug is an unforseen +consequence of change #1 for 10.30 - the implementation of backtracking into +recursions. Iterators at the ends of capturing groups are no longer considered +for auto-possessification if the pattern contains any recursions. Fixes +Bugzilla #2232. + Version 10.30 14-August-2017 ---------------------------- diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c index ad3543f..23275a2 100644 --- a/src/pcre2_auto_possess.c +++ b/src/pcre2_auto_possess.c @@ -558,47 +558,73 @@ for(;;) continue; } + /* At the end of a branch, skip to the end of the group. */ + if (c == OP_ALT) { do code += GET(code, 1); while (*code == OP_ALT); c = *code; } + /* Inspect the next opcode. */ + switch(c) { - case OP_END: - case OP_KETRPOS: - /* TRUE only in greedy case. The non-greedy case could be replaced by - an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT - uses more memory, which we cannot get at this stage.) */ + /* We can always possessify a greedy iterator at the end of the pattern, + which is reached after skipping over the final OP_KET. A non-greedy + iterator must never be possessified. */ + case OP_END: return base_list[1] != 0; + /* When an iterator is at the end of certain kinds of group we can inspect + what follows the group by skipping over the closing ket. Note that this + does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given + iteration is variable (could be another iteration or could be the next + item). As these two opcodes are not listed in the next switch, they will + end up as the next code to inspect, and return FALSE by virtue of being + unsupported. */ + case OP_KET: - /* If the bracket is capturing, and referenced by an OP_RECURSE, or - it is an atomic sub-pattern (assert, once, etc.) the non-greedy case - cannot be converted to a possessive form. */ + case OP_KETRPOS: + /* The non-greedy case cannot be converted to a possessive form. */ if (base_list[1] == 0) return FALSE; + /* If the bracket is capturing it might be referenced by an OP_RECURSE + so its last iterator can never be possessified if the pattern contains + recursions. (This could be improved by keeping a list of group numbers that + are called by recursion.) */ + switch(*(code - GET(code, 1))) { + case OP_CBRA: + case OP_SCBRA: + case OP_CBRAPOS: + case OP_SCBRAPOS: + if (cb->had_recurse) return FALSE; + break; + + /* Atomic sub-patterns and assertions can always auto-possessify their + last iterator. However, if the group was entered as a result of checking + a previous iterator, this is not possible. */ + case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: case OP_ONCE: - /* Atomic sub-patterns and assertions can always auto-possessify their - last iterator. However, if the group was entered as a result of checking - a previous iterator, this is not possible. */ - return !entered_a_group; } + /* Skip over the bracket and inspect what comes next. */ + code += PRIV(OP_lengths)[c]; continue; + /* Handle cases where the next item is a group. */ + case OP_ONCE: case OP_BRA: case OP_CBRA: @@ -637,11 +663,15 @@ for(;;) code += PRIV(OP_lengths)[c]; continue; + /* The next opcode does not need special handling; fall through and use it + to see if the base can be possessified. */ + default: break; } - /* Check for a supported opcode, and load its properties. */ + /* We now have the next appropriate opcode to compare with the base. Check + for a supported opcode, and load its properties. */ code = get_chr_property_list(code, utf, cb->fcc, list); if (code == NULL) return FALSE; /* Unsupported */ diff --git a/testdata/testinput1 b/testdata/testinput1 index 756b4af..9a9c5fd 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -6159,4 +6159,34 @@ ef) x/x,mark /((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/ XYYZ +/(?(DEFINE)(?a?)X)^(?&optional_a)a$/ + aa + a + +/^(a?)b(?1)a/ + abaa + aba + baa + ba + +/^(a?)+b(?1)a/ + abaa + aba + baa + ba + +/^(a?)++b(?1)a/ + abaa + aba + baa + ba + +/^(a?)+b/ + b + ab + aaab + +/(?=a+)a(a+)++b/ + aab + # End of testinput1 diff --git a/testdata/testinput2 b/testdata/testinput2 index 2c2334c..5d3a80e 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5412,4 +5412,21 @@ a)"xI \= Expect no match \na +# These tests are matched in test 1 as they are Perl compatible. Here we are +# looking at what does and does not get auto-possessified. + +/(?(DEFINE)(?a?))^(?&optional_a)a$/B + +/(?(DEFINE)(?a?)X)^(?&optional_a)a$/B + +/^(a?)b(?1)a/B + +/^(a?)+b(?1)a/B + +/^(a?)++b(?1)a/B + +/^(a?)+b/B + +/(?=a+)a(a+)++b/B + # End of testinput2 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 7cf647e..9c55be9 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9758,4 +9758,68 @@ No match 1: Y 2: +/(?(DEFINE)(?a?)X)^(?&optional_a)a$/ + aa + 0: aa + a + 0: a + +/^(a?)b(?1)a/ + abaa + 0: abaa + 1: a + aba + 0: aba + 1: a + baa + 0: baa + 1: + ba + 0: ba + 1: + +/^(a?)+b(?1)a/ + abaa + 0: abaa + 1: + aba + 0: aba + 1: + baa + 0: baa + 1: + ba + 0: ba + 1: + +/^(a?)++b(?1)a/ + abaa + 0: abaa + 1: + aba + 0: aba + 1: + baa + 0: baa + 1: + ba + 0: ba + 1: + +/^(a?)+b/ + b + 0: b + 1: + ab + 0: ab + 1: + aaab + 0: aaab + 1: + +/(?=a+)a(a+)++b/ + aab + 0: aab + 1: a + # End of testinput1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index f783320..fcaac8f 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -12701,7 +12701,7 @@ Subject length lower bound = 5 Ket a CBraPos 1 - a++ + a+ KetRpos a Ket @@ -16468,6 +16468,113 @@ No match \na No match +# These tests are matched in test 1 as they are Perl compatible. Here we are +# looking at what does and does not get auto-possessified. + +/(?(DEFINE)(?a?))^(?&optional_a)a$/B +------------------------------------------------------------------ + Bra + Cond + Cond false + CBra 1 + a? + Ket + Ket + ^ + Recurse + a + $ + Ket + End +------------------------------------------------------------------ + +/(?(DEFINE)(?a?)X)^(?&optional_a)a$/B +------------------------------------------------------------------ + Bra + Cond + Cond false + CBra 1 + a? + Ket + X + Ket + ^ + Recurse + a + $ + Ket + End +------------------------------------------------------------------ + +/^(a?)b(?1)a/B +------------------------------------------------------------------ + Bra + ^ + CBra 1 + a? + Ket + b + Recurse + a + Ket + End +------------------------------------------------------------------ + +/^(a?)+b(?1)a/B +------------------------------------------------------------------ + Bra + ^ + SCBra 1 + a? + KetRmax + b + Recurse + a + Ket + End +------------------------------------------------------------------ + +/^(a?)++b(?1)a/B +------------------------------------------------------------------ + Bra + ^ + SCBraPos 1 + a? + KetRpos + b + Recurse + a + Ket + End +------------------------------------------------------------------ + +/^(a?)+b/B +------------------------------------------------------------------ + Bra + ^ + SCBra 1 + a? + KetRmax + b + Ket + End +------------------------------------------------------------------ + +/(?=a+)a(a+)++b/B +------------------------------------------------------------------ + Bra + Assert + a++ + Ket + a + CBraPos 1 + a++ + KetRpos + b + Ket + End +------------------------------------------------------------------ + # End of testinput2 Error -65: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data