Fix issues with (*VERB)s inside recursive subroutine calls.
This commit is contained in:
parent
d5ca2dee9d
commit
45ddeb70cf
|
@ -15,9 +15,8 @@ the old code had a number of fudges to try to reduce stack usage. It seems to
|
|||
run no slower than the old code.
|
||||
|
||||
A number of bugs in the refactored code were subsequently fixed during testing
|
||||
before release, but after the code was made available in the repository. Many
|
||||
of the bugs were discovered by fuzzing testing. These bugs were never in fully
|
||||
released code, but are noted here for the record.
|
||||
before release, but after the code was made available in the repository. These
|
||||
bugs were never in fully released code, but are noted here for the record.
|
||||
|
||||
(a) If a pattern had fewer capturing parentheses than the ovector supplied in
|
||||
the match data block, a memory error (detectable by ASAN) occurred after
|
||||
|
@ -30,6 +29,8 @@ released code, but are noted here for the record.
|
|||
vector on the stack is not big enough to handle at least 10 frames.
|
||||
Fixes oss-fuzz issue 783.
|
||||
|
||||
(c) Handling of (*VERB)s in recursions was wrong in some cases.
|
||||
|
||||
2. Now that pcre2_match() no longer uses recursive function calls (see above),
|
||||
the "match limit recursion" value seems misnamed. It still exists, and limits
|
||||
the depth of tree that is searched. To avoid future confusion, it has been
|
||||
|
|
|
@ -826,13 +826,14 @@ typedef struct match_block {
|
|||
PCRE2_SPTR start_code; /* For use when recursing */
|
||||
PCRE2_SPTR start_subject; /* Start of the subject string */
|
||||
PCRE2_SPTR end_subject; /* End of the subject string */
|
||||
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
|
||||
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
|
||||
PCRE2_SPTR end_match_ptr; /* Subject position at end match */
|
||||
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
|
||||
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
|
||||
PCRE2_SPTR mark; /* Mark pointer to pass back on success */
|
||||
PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
|
||||
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
|
||||
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
|
||||
uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
|
||||
uint32_t moptions; /* Match options */
|
||||
uint32_t poptions; /* Pattern options */
|
||||
uint32_t skip_arg_count; /* For counting SKIP_ARGs */
|
||||
|
|
|
@ -5051,8 +5051,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
offset data is the offset to the starting bracket from the start of the
|
||||
whole pattern. (This is so that it works from duplicated subpatterns.) */
|
||||
|
||||
#define Lframe_type F->temp_32[0]
|
||||
#define Lstart_group F->temp_sptr[0]
|
||||
#define Lframe_type F->temp_32[0]
|
||||
#define Lstart_branch F->temp_sptr[0]
|
||||
|
||||
case OP_RECURSE:
|
||||
bracode = mb->start_code + GET(Fecode, 1);
|
||||
|
@ -5083,42 +5083,47 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
bracket. We must leave Fecode unchanged so that the ending code can find
|
||||
out where to continue. */
|
||||
|
||||
Lstart_group = bracode;
|
||||
Lstart_branch = bracode;
|
||||
Lframe_type = GF_RECURSE | number;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
PCRE2_SPTR next_ecode;
|
||||
|
||||
group_frame_type = Lframe_type;
|
||||
RMATCH(Lstart_group + PRIV(OP_lengths)[*Lstart_group], RM11);
|
||||
RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
|
||||
next_ecode = Lstart_branch + GET(Lstart_branch,1);
|
||||
|
||||
/* See comment above about handling THEN. */
|
||||
/* Handle backtracking verbs, which are defined in a range that can
|
||||
easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
|
||||
escape beyond a recursion; they cause a NOMATCH for the entire recursion.
|
||||
|
||||
if (rrc == MATCH_THEN)
|
||||
When one of these verbs triggers, the current recursion group number is
|
||||
recorded. If it matches the recursion we are processing, the verb
|
||||
happened within the recursion and we must deal with it. Otherwise it must
|
||||
have happened after the recursion completed, and so has to be passed
|
||||
back. See comment above about handling THEN. */
|
||||
|
||||
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
|
||||
mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
|
||||
{
|
||||
PCRE2_SPTR next_ecode = Lstart_group + GET(Lstart_group,1);
|
||||
if (mb->verb_ecode_ptr < next_ecode &&
|
||||
(*Lstart_group == OP_ALT || *next_ecode == OP_ALT))
|
||||
if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
|
||||
(*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
|
||||
rrc = MATCH_NOMATCH;
|
||||
else RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
|
||||
/* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
|
||||
recursion; they cause a NOMATCH for the entire recursion. These codes are
|
||||
defined in a range that can be tested for. */
|
||||
|
||||
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
|
||||
/* Note that carrying on after (*ACCEPT) in a recursion is handled in the
|
||||
OP_ACCEPT code. Nothing needs to be done here. */
|
||||
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
Lstart_group += GET(Lstart_group, 1);
|
||||
if (*Lstart_group != OP_ALT) RRETURN(MATCH_NOMATCH);
|
||||
Lstart_branch = next_ecode;
|
||||
if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
/* Control never reaches here. */
|
||||
|
||||
#undef Lframe_type
|
||||
#undef Lstart_group
|
||||
#undef Lstart_branch
|
||||
|
||||
|
||||
/* ===================================================================== */
|
||||
|
@ -5535,8 +5540,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
|
||||
/* Whole-pattern recursion is coded as a recurse into group 0, so it
|
||||
won't be picked up here. Instead, we catch it when the OP_END is reached.
|
||||
Other recursion is handled here. We just have to record the current
|
||||
subject position and start match pointer and give a MATCH return. */
|
||||
Other recursion is handled here. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_CBRAPOS:
|
||||
|
@ -5545,7 +5549,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
number = GET2(bracode, 1+LINK_SIZE);
|
||||
|
||||
/* Handle a recursively called group. We reinstate the previous set of
|
||||
captures and then carry on. */
|
||||
captures and then carry on after the recursion call. */
|
||||
|
||||
if (Fcurrent_recurse == number)
|
||||
{
|
||||
|
@ -5837,26 +5841,34 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
case OP_FAIL:
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
|
||||
/* Record the current recursing group number in mb->verb_current_recurse
|
||||
when a backtracking return such as MATCH_COMMIT is given. This enables the
|
||||
recurse processing to catch verbs from within the recursion. */
|
||||
|
||||
case OP_COMMIT:
|
||||
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_COMMIT);
|
||||
|
||||
case OP_PRUNE:
|
||||
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_PRUNE);
|
||||
|
||||
case OP_PRUNE_ARG:
|
||||
Fmark = mb->nomatch_mark = Fecode + 2;
|
||||
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_PRUNE);
|
||||
|
||||
case OP_SKIP:
|
||||
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
mb->verb_skip_ptr = Feptr; /* Pass back current position */
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_SKIP);
|
||||
|
||||
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
|
||||
|
@ -5883,6 +5895,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
mb->skip_arg_count. */
|
||||
|
||||
mb->verb_skip_ptr = Fecode + 2;
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_SKIP_ARG);
|
||||
|
||||
/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
|
||||
|
@ -5892,14 +5905,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
mb->verb_ecode_ptr = Fecode;
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_THEN);
|
||||
|
||||
case OP_THEN_ARG:
|
||||
Fmark = mb->nomatch_mark = Fecode + 2;
|
||||
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
|
||||
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
mb->verb_ecode_ptr = Fecode;
|
||||
mb->verb_current_recurse = Fcurrent_recurse;
|
||||
RRETURN(MATCH_THEN);
|
||||
|
||||
|
||||
|
@ -6243,7 +6257,7 @@ correct when calling match() more than once for non-anchored patterns. */
|
|||
frame_size = sizeof(heapframe) + ((re->top_bracket - 1) * 2 * sizeof(PCRE2_SIZE));
|
||||
|
||||
/* If a pattern has very many capturing parentheses, the frame size may be very
|
||||
large. Ensure that there are at least 10 available frames by getting an initial
|
||||
large. Ensure that there are at least 10 available frames by getting an initial
|
||||
vector on the heap if necessary. */
|
||||
|
||||
if (frame_size <= START_FRAMES_SIZE/10)
|
||||
|
|
|
@ -4957,9 +4957,12 @@ a)"xI
|
|||
|
||||
//
|
||||
\=ovector=7777777777
|
||||
|
||||
# This is here because Perl matches, even though a COMMIT is encountered
|
||||
# outside of the recursion.
|
||||
|
||||
/(?1)(A(*COMMIT)|B)D/
|
||||
BAXBAD\=no_jit
|
||||
BAXBAD
|
||||
|
||||
"(?1){2}(a)"B
|
||||
|
||||
|
@ -5001,7 +5004,7 @@ a)"xI
|
|||
/^(.|(.)(?1)?\2)$/
|
||||
abcba
|
||||
|
||||
# The first of these, when run by Perl, give the mark 'aa', which is wrong.
|
||||
# The first of these, when run by Perl, gives the mark 'aa', which is wrong.
|
||||
|
||||
'(?>a(*:aa))b|ac' mark
|
||||
ac
|
||||
|
@ -5019,4 +5022,11 @@ a)"xI
|
|||
|
||||
/\g{3/
|
||||
|
||||
# Perl matches this one, but PCRE does not because (*ACCEPT) clears out any
|
||||
# pending backtracks in the recursion.
|
||||
|
||||
/^ (?(DEFINE) (..(*ACCEPT)|...) ) (?1)$/x
|
||||
\= Expect no match
|
||||
abc
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -15424,11 +15424,13 @@ Subject length lower bound = 11
|
|||
//
|
||||
\=ovector=7777777777
|
||||
** Invalid value in 'ovector=7777777777'
|
||||
|
||||
# This is here because Perl matches, even though a COMMIT is encountered
|
||||
# outside of the recursion.
|
||||
|
||||
/(?1)(A(*COMMIT)|B)D/
|
||||
BAXBAD\=no_jit
|
||||
0: BAD
|
||||
1: A
|
||||
BAXBAD
|
||||
No match
|
||||
|
||||
"(?1){2}(a)"B
|
||||
------------------------------------------------------------------
|
||||
|
@ -15549,7 +15551,7 @@ Subject length lower bound = 11
|
|||
1: abcba
|
||||
2: a
|
||||
|
||||
# The first of these, when run by Perl, give the mark 'aa', which is wrong.
|
||||
# The first of these, when run by Perl, gives the mark 'aa', which is wrong.
|
||||
|
||||
'(?>a(*:aa))b|ac' mark
|
||||
ac
|
||||
|
@ -15573,6 +15575,14 @@ No match
|
|||
/\g{3/
|
||||
Failed: error 157 at offset 2: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
|
||||
|
||||
# Perl matches this one, but PCRE does not because (*ACCEPT) clears out any
|
||||
# pending backtracks in the recursion.
|
||||
|
||||
/^ (?(DEFINE) (..(*ACCEPT)|...) ) (?1)$/x
|
||||
\= Expect no match
|
||||
abc
|
||||
No match
|
||||
|
||||
# End of testinput2
|
||||
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error -62: bad serialized data
|
||||
|
|
Loading…
Reference in New Issue