Fix issues with (*VERB)s inside recursive subroutine calls.

This commit is contained in:
Philip.Hazel 2017-03-23 17:54:58 +00:00
parent d5ca2dee9d
commit 45ddeb70cf
5 changed files with 71 additions and 35 deletions

View File

@ -15,9 +15,8 @@ the old code had a number of fudges to try to reduce stack usage. It seems to
run no slower than the old code.
A number of bugs in the refactored code were subsequently fixed during testing
before release, but after the code was made available in the repository. Many
of the bugs were discovered by fuzzing testing. These bugs were never in fully
released code, but are noted here for the record.
before release, but after the code was made available in the repository. These
bugs were never in fully released code, but are noted here for the record.
(a) If a pattern had fewer capturing parentheses than the ovector supplied in
the match data block, a memory error (detectable by ASAN) occurred after
@ -30,6 +29,8 @@ released code, but are noted here for the record.
vector on the stack is not big enough to handle at least 10 frames.
Fixes oss-fuzz issue 783.
(c) Handling of (*VERB)s in recursions was wrong in some cases.
2. Now that pcre2_match() no longer uses recursive function calls (see above),
the "match limit recursion" value seems misnamed. It still exists, and limits
the depth of tree that is searched. To avoid future confusion, it has been

View File

@ -826,13 +826,14 @@ typedef struct match_block {
PCRE2_SPTR start_code; /* For use when recursing */
PCRE2_SPTR start_subject; /* Start of the subject string */
PCRE2_SPTR end_subject; /* End of the subject string */
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
PCRE2_SPTR end_match_ptr; /* Subject position at end match */
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
PCRE2_SPTR mark; /* Mark pointer to pass back on success */
PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
uint32_t moptions; /* Match options */
uint32_t poptions; /* Pattern options */
uint32_t skip_arg_count; /* For counting SKIP_ARGs */

View File

@ -5051,8 +5051,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
offset data is the offset to the starting bracket from the start of the
whole pattern. (This is so that it works from duplicated subpatterns.) */
#define Lframe_type F->temp_32[0]
#define Lstart_group F->temp_sptr[0]
#define Lframe_type F->temp_32[0]
#define Lstart_branch F->temp_sptr[0]
case OP_RECURSE:
bracode = mb->start_code + GET(Fecode, 1);
@ -5083,42 +5083,47 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
bracket. We must leave Fecode unchanged so that the ending code can find
out where to continue. */
Lstart_group = bracode;
Lstart_branch = bracode;
Lframe_type = GF_RECURSE | number;
for (;;)
{
PCRE2_SPTR next_ecode;
group_frame_type = Lframe_type;
RMATCH(Lstart_group + PRIV(OP_lengths)[*Lstart_group], RM11);
RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
next_ecode = Lstart_branch + GET(Lstart_branch,1);
/* See comment above about handling THEN. */
/* Handle backtracking verbs, which are defined in a range that can
easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
escape beyond a recursion; they cause a NOMATCH for the entire recursion.
if (rrc == MATCH_THEN)
When one of these verbs triggers, the current recursion group number is
recorded. If it matches the recursion we are processing, the verb
happened within the recursion and we must deal with it. Otherwise it must
have happened after the recursion completed, and so has to be passed
back. See comment above about handling THEN. */
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
{
PCRE2_SPTR next_ecode = Lstart_group + GET(Lstart_group,1);
if (mb->verb_ecode_ptr < next_ecode &&
(*Lstart_group == OP_ALT || *next_ecode == OP_ALT))
if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
(*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
rrc = MATCH_NOMATCH;
else RRETURN(MATCH_NOMATCH);
}
/* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
recursion; they cause a NOMATCH for the entire recursion. These codes are
defined in a range that can be tested for. */
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
RRETURN(MATCH_NOMATCH);
/* Note that carrying on after (*ACCEPT) in a recursion is handled in the
OP_ACCEPT code. Nothing needs to be done here. */
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
Lstart_group += GET(Lstart_group, 1);
if (*Lstart_group != OP_ALT) RRETURN(MATCH_NOMATCH);
Lstart_branch = next_ecode;
if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
}
/* Control never reaches here. */
#undef Lframe_type
#undef Lstart_group
#undef Lstart_branch
/* ===================================================================== */
@ -5535,8 +5540,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* Whole-pattern recursion is coded as a recurse into group 0, so it
won't be picked up here. Instead, we catch it when the OP_END is reached.
Other recursion is handled here. We just have to record the current
subject position and start match pointer and give a MATCH return. */
Other recursion is handled here. */
case OP_CBRA:
case OP_CBRAPOS:
@ -5545,7 +5549,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
number = GET2(bracode, 1+LINK_SIZE);
/* Handle a recursively called group. We reinstate the previous set of
captures and then carry on. */
captures and then carry on after the recursion call. */
if (Fcurrent_recurse == number)
{
@ -5837,26 +5841,34 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case OP_FAIL:
RRETURN(MATCH_NOMATCH);
/* Record the current recursing group number in mb->verb_current_recurse
when a backtracking return such as MATCH_COMMIT is given. This enables the
recurse processing to catch verbs from within the recursion. */
case OP_COMMIT:
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_COMMIT);
case OP_PRUNE:
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_PRUNE);
case OP_PRUNE_ARG:
Fmark = mb->nomatch_mark = Fecode + 2;
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_PRUNE);
case OP_SKIP:
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_skip_ptr = Feptr; /* Pass back current position */
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_SKIP);
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
@ -5883,6 +5895,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
mb->skip_arg_count. */
mb->verb_skip_ptr = Fecode + 2;
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_SKIP_ARG);
/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
@ -5892,14 +5905,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_ecode_ptr = Fecode;
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_THEN);
case OP_THEN_ARG:
Fmark = mb->nomatch_mark = Fecode + 2;
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_ecode_ptr = Fecode;
mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_THEN);
@ -6243,7 +6257,7 @@ correct when calling match() more than once for non-anchored patterns. */
frame_size = sizeof(heapframe) + ((re->top_bracket - 1) * 2 * sizeof(PCRE2_SIZE));
/* If a pattern has very many capturing parentheses, the frame size may be very
large. Ensure that there are at least 10 available frames by getting an initial
large. Ensure that there are at least 10 available frames by getting an initial
vector on the heap if necessary. */
if (frame_size <= START_FRAMES_SIZE/10)

14
testdata/testinput2 vendored
View File

@ -4957,9 +4957,12 @@ a)"xI
//
\=ovector=7777777777
# This is here because Perl matches, even though a COMMIT is encountered
# outside of the recursion.
/(?1)(A(*COMMIT)|B)D/
BAXBAD\=no_jit
BAXBAD
"(?1){2}(a)"B
@ -5001,7 +5004,7 @@ a)"xI
/^(.|(.)(?1)?\2)$/
abcba
# The first of these, when run by Perl, give the mark 'aa', which is wrong.
# The first of these, when run by Perl, gives the mark 'aa', which is wrong.
'(?>a(*:aa))b|ac' mark
ac
@ -5019,4 +5022,11 @@ a)"xI
/\g{3/
# Perl matches this one, but PCRE does not because (*ACCEPT) clears out any
# pending backtracks in the recursion.
/^ (?(DEFINE) (..(*ACCEPT)|...) ) (?1)$/x
\= Expect no match
abc
# End of testinput2

18
testdata/testoutput2 vendored
View File

@ -15424,11 +15424,13 @@ Subject length lower bound = 11
//
\=ovector=7777777777
** Invalid value in 'ovector=7777777777'
# This is here because Perl matches, even though a COMMIT is encountered
# outside of the recursion.
/(?1)(A(*COMMIT)|B)D/
BAXBAD\=no_jit
0: BAD
1: A
BAXBAD
No match
"(?1){2}(a)"B
------------------------------------------------------------------
@ -15549,7 +15551,7 @@ Subject length lower bound = 11
1: abcba
2: a
# The first of these, when run by Perl, give the mark 'aa', which is wrong.
# The first of these, when run by Perl, gives the mark 'aa', which is wrong.
'(?>a(*:aa))b|ac' mark
ac
@ -15573,6 +15575,14 @@ No match
/\g{3/
Failed: error 157 at offset 2: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
# Perl matches this one, but PCRE does not because (*ACCEPT) clears out any
# pending backtracks in the recursion.
/^ (?(DEFINE) (..(*ACCEPT)|...) ) (?1)$/x
\= Expect no match
abc
No match
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data