Fix very slow find_minlength when mutual recursion is present.
This commit is contained in:
parent
f123833bdb
commit
3e1748390b
|
@ -80,6 +80,11 @@ reference to random memory and/or a segfault. There were also some other cases
|
||||||
where backtracking after \C could crash. This set of bugs was discovered by the
|
where backtracking after \C could crash. This set of bugs was discovered by the
|
||||||
LLVM fuzzer.
|
LLVM fuzzer.
|
||||||
|
|
||||||
|
20. The function for finding the minimum length of a matching string could take
|
||||||
|
a very long time if mutual recursion was present many times in a pattern, for
|
||||||
|
example, /((?2){73}(?2))((?1))/. A better mutual recursion detection method has
|
||||||
|
been implemented. This infelicity was discovered by the LLVM fuzzer.
|
||||||
|
|
||||||
|
|
||||||
Version 10.10 06-March-2015
|
Version 10.10 06-March-2015
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@ -677,14 +677,6 @@ static const uint8_t opcode_possessify[] = {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* Structure for checking for mutual recursion when scanning compiled code. */
|
|
||||||
|
|
||||||
typedef struct recurse_check {
|
|
||||||
struct recurse_check *prev;
|
|
||||||
PCRE2_SPTR group;
|
|
||||||
} recurse_check;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Free compiled code *
|
* Free compiled code *
|
||||||
|
|
|
@ -640,6 +640,13 @@ typedef struct pcre2_real_match_data {
|
||||||
|
|
||||||
#ifndef PCRE2_PCRE2TEST
|
#ifndef PCRE2_PCRE2TEST
|
||||||
|
|
||||||
|
/* Structure for checking for mutual recursion when scanning compiled code. */
|
||||||
|
|
||||||
|
typedef struct recurse_check {
|
||||||
|
struct recurse_check *prev;
|
||||||
|
PCRE2_SPTR group;
|
||||||
|
} recurse_check;
|
||||||
|
|
||||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||||
branches, for testing for left recursion while compiling. */
|
branches, for testing for left recursion while compiling. */
|
||||||
|
|
||||||
|
|
|
@ -73,23 +73,23 @@ Arguments:
|
||||||
re compiled pattern block
|
re compiled pattern block
|
||||||
code pointer to start of group (the bracket)
|
code pointer to start of group (the bracket)
|
||||||
startcode pointer to start of the whole pattern's code
|
startcode pointer to start of the whole pattern's code
|
||||||
recurse_depth RECURSE and/or backreference depth
|
|
||||||
utf UTF flag
|
utf UTF flag
|
||||||
|
recurses chain of recurse_check to catch mutual recursion
|
||||||
|
|
||||||
Returns: the minimum length
|
Returns: the minimum length
|
||||||
-1 \C in UTF-8 mode
|
-1 \C in UTF-8 mode
|
||||||
or (*ACCEPT)
|
or (*ACCEPT)
|
||||||
or too much back reference recursion
|
|
||||||
-2 internal error (missing capturing bracket)
|
-2 internal error (missing capturing bracket)
|
||||||
-3 internal error (opcode not listed)
|
-3 internal error (opcode not listed)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
|
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
|
||||||
PCRE2_SPTR startcode, int recurse_depth, BOOL utf)
|
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses)
|
||||||
{
|
{
|
||||||
int length = -1;
|
int length = -1;
|
||||||
BOOL had_recurse = FALSE;
|
BOOL had_recurse = FALSE;
|
||||||
|
recurse_check this_recurse;
|
||||||
register int branchlength = 0;
|
register int branchlength = 0;
|
||||||
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
||||||
|
|
||||||
|
@ -134,7 +134,7 @@ for (;;)
|
||||||
case OP_SBRAPOS:
|
case OP_SBRAPOS:
|
||||||
case OP_ONCE:
|
case OP_ONCE:
|
||||||
case OP_ONCE_NC:
|
case OP_ONCE_NC:
|
||||||
d = find_minlength(re, cc, startcode, recurse_depth, utf);
|
d = find_minlength(re, cc, startcode, utf, recurses);
|
||||||
if (d < 0) return d;
|
if (d < 0) return d;
|
||||||
branchlength += d;
|
branchlength += d;
|
||||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||||
|
@ -377,13 +377,12 @@ for (;;)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Backreferences and subroutine calls are treated in the same way: we find
|
/* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
|
||||||
the minimum length for the subpattern. A recursion, however, causes
|
way: we find the minimum length for the subpattern. A recursion
|
||||||
a flag to be set that causes the length of this branch to be ignored. The
|
(backreference or subroutine) causes a flag to be set that causes the
|
||||||
logic is that a recursion can only make sense if there is another
|
length of this branch to be ignored. The logic is that a recursion can only
|
||||||
alternative that stops the recursing. That will provide the minimum length
|
make sense if there is another alternative that stops the recursing. That
|
||||||
(when no recursion happens). A backreference within the group that it is
|
will provide the minimum length (when no recursion happens).
|
||||||
referencing behaves in the same way.
|
|
||||||
|
|
||||||
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
|
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
|
||||||
matches an empty string (by default it causes a matching failure), so in
|
matches an empty string (by default it causes a matching failure), so in
|
||||||
|
@ -399,12 +398,15 @@ for (;;)
|
||||||
GET2(cc, 1) * re->name_entry_size;
|
GET2(cc, 1) * re->name_entry_size;
|
||||||
|
|
||||||
d = INT_MAX;
|
d = INT_MAX;
|
||||||
|
|
||||||
|
/* Scan all groups with the same name */
|
||||||
|
|
||||||
while (count-- > 0)
|
while (count-- > 0)
|
||||||
{
|
{
|
||||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
|
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
|
||||||
if (cs == NULL) return -2;
|
if (cs == NULL) return -2;
|
||||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||||
if ((cc > cs && cc < ce) || recurse_depth > 10)
|
if (cc > cs && cc < ce) /* Simple recursion */
|
||||||
{
|
{
|
||||||
d = 0;
|
d = 0;
|
||||||
had_recurse = TRUE;
|
had_recurse = TRUE;
|
||||||
|
@ -412,8 +414,22 @@ for (;;)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int dd = find_minlength(re, cs, startcode, recurse_depth + 1, utf);
|
recurse_check *r = recurses;
|
||||||
if (dd < d) d = dd;
|
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||||
|
if (r != NULL) /* Mutual recursion */
|
||||||
|
{
|
||||||
|
d = 0;
|
||||||
|
had_recurse = TRUE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int dd;
|
||||||
|
this_recurse.prev = recurses;
|
||||||
|
this_recurse.group = cs;
|
||||||
|
dd = find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||||
|
if (dd < d) d = dd;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
slot += re->name_entry_size;
|
slot += re->name_entry_size;
|
||||||
}
|
}
|
||||||
|
@ -429,14 +445,26 @@ for (;;)
|
||||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
|
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
|
||||||
if (cs == NULL) return -2;
|
if (cs == NULL) return -2;
|
||||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||||
if ((cc > cs && cc < ce) || recurse_depth > 10)
|
if (cc > cs && cc < ce) /* Simple recursion */
|
||||||
{
|
{
|
||||||
d = 0;
|
d = 0;
|
||||||
had_recurse = TRUE;
|
had_recurse = TRUE;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
d = find_minlength(re, cs, startcode, recurse_depth + 1, utf);
|
recurse_check *r = recurses;
|
||||||
|
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||||
|
if (r != NULL) /* Mutual recursion */
|
||||||
|
{
|
||||||
|
d = 0;
|
||||||
|
had_recurse = TRUE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
this_recurse.prev = recurses;
|
||||||
|
this_recurse.group = cs;
|
||||||
|
d = find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else d = 0;
|
else d = 0;
|
||||||
|
@ -479,17 +507,23 @@ for (;;)
|
||||||
branchlength += min * d;
|
branchlength += min * d;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* We can easily detect direct recursion, but not mutual recursion. This is
|
|
||||||
caught by a recursion depth count. */
|
|
||||||
|
|
||||||
case OP_RECURSE:
|
case OP_RECURSE:
|
||||||
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
||||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||||
if ((cc > cs && cc < ce) || recurse_depth > 10)
|
if (cc > cs && cc < ce) /* Simple recursion */
|
||||||
had_recurse = TRUE;
|
had_recurse = TRUE;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
branchlength += find_minlength(re, cs, startcode, recurse_depth + 1, utf);
|
recurse_check *r = recurses;
|
||||||
|
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||||
|
if (r != NULL) /* Mutual recursion */
|
||||||
|
had_recurse = TRUE;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
this_recurse.prev = recurses;
|
||||||
|
this_recurse.group = cs;
|
||||||
|
branchlength += find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
cc += 1 + LINK_SIZE;
|
cc += 1 + LINK_SIZE;
|
||||||
break;
|
break;
|
||||||
|
@ -1429,9 +1463,9 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
|
||||||
|
|
||||||
/* Find the minimum length of subject string. */
|
/* Find the minimum length of subject string. */
|
||||||
|
|
||||||
switch(min = find_minlength(re, code, code, 0, utf))
|
switch(min = find_minlength(re, code, code, utf, NULL))
|
||||||
{
|
{
|
||||||
case -1: /* \C in UTF mode or (*ACCEPT) or too much backref recursion */
|
case -1: /* \C in UTF mode or (*ACCEPT) */
|
||||||
break; /* Leave minlength unchanged (will be zero) */
|
break; /* Leave minlength unchanged (will be zero) */
|
||||||
|
|
||||||
case -2:
|
case -2:
|
||||||
|
|
|
@ -4263,4 +4263,6 @@ a random value. /Ix
|
||||||
/(?<=\Ka)/altglobal,aftertext
|
/(?<=\Ka)/altglobal,aftertext
|
||||||
aaaaa
|
aaaaa
|
||||||
|
|
||||||
|
/((?2){73}(?2))((?1))/info
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -14288,4 +14288,9 @@ Failed: error 125 at offset 20: lookbehind assertion is not fixed length
|
||||||
0: a
|
0: a
|
||||||
0+
|
0+
|
||||||
|
|
||||||
|
/((?2){73}(?2))((?1))/info
|
||||||
|
Capturing subpattern count = 2
|
||||||
|
May match empty string
|
||||||
|
Subject length lower bound = 0
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
Loading…
Reference in New Issue