Fix issues with minimum length finding.

This commit is contained in:
Philip.Hazel 2015-08-01 09:11:28 +00:00
parent d1caa059fc
commit 8269696f57
5 changed files with 106 additions and 33 deletions

View File

@ -85,6 +85,13 @@ string if the final multi-byte UTF-8 character was truncated.
class, where both values are literal letters in the same case, omit the
non-letter EBCDIC code points within the range.
23. Finding the minimum matching length of complex patterns with back
references and/or recursions can take a long time. There is now a cut-off that
gives up trying to find a minimum length when things get too complex.
24. An optimization has been added that speeds up finding the minimum matching
length for patterns containing repeated capturing groups or recursions.
Version 10.20 30-June-2015
--------------------------

View File

@ -83,7 +83,7 @@ for (;;)
if (c == OP_XCLASS) code += GET(code, 1);
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
/* Handle recursion */
/* Handle lookbehind */
else if (c == OP_REVERSE)
{

View File

@ -59,7 +59,6 @@ collecting data (e.g. minimum matching length). */
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
/*************************************************
* Find the minimum subject length for a group *
*************************************************/
@ -75,24 +74,35 @@ Arguments:
startcode pointer to start of the whole pattern's code
utf UTF flag
recurses chain of recurse_check to catch mutual recursion
countptr pointer to call count (to catch over complexity)
Returns: the minimum length
-1 \C in UTF-8 mode
or (*ACCEPT)
or pattern too complicated
-2 internal error (missing capturing bracket)
-3 internal error (opcode not listed)
*/
static int
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses)
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
{
int length = -1;
int prev_recno = -1;
int prev_d = 0;
uint32_t once_fudge = 0;
BOOL had_recurse = FALSE;
recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
/* A large and/or complex regex can take too long to process. */
if ((*countptr)++ > 1000) return -1;
/* Skip over capturing bracket number */
if (*code == OP_CBRA || *code == OP_SCBRA ||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
@ -101,7 +111,7 @@ branch, check the length against that of the other branches. */
for (;;)
{
int d, min;
int d, min, recno;
PCRE2_UCHAR *cs, *ce;
register PCRE2_UCHAR op = *cc;
@ -112,7 +122,8 @@ for (;;)
/* If there is only one branch in a condition, the implied branch has zero
length, so we don't add anything. This covers the DEFINE "condition"
automatically. */
automatically. If there are two branches we can treat it the same as any
other non-capturing subpattern. */
cs = cc + GET(cc, 1);
if (*cs != OP_ALT)
@ -120,23 +131,51 @@ for (;;)
cc = cs + 1 + LINK_SIZE;
break;
}
goto PROCESS_NON_CAPTURE;
/* Otherwise we can fall through and treat it the same as any other
subpattern. */
/* There's a special case of OP_ONCE, when it is wrapped round an
OP_RECURSE. We'd like to process the latter at this level so that
remembering the value works for repeated cases. So we do nothing, but
set a fudge value to skip over the OP_KET after the recurse. */
case OP_ONCE:
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
{
once_fudge = 1 + LINK_SIZE;
cc += 1 + LINK_SIZE;
break;
}
/* Fall through */
case OP_ONCE_NC:
case OP_BRA:
case OP_SBRA:
case OP_BRAPOS:
case OP_SBRAPOS:
PROCESS_NON_CAPTURE:
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
/* To save time for repeated capturing subpatterns, we remember the
length of the previous one. Unfortunately we can't do the same for
the unnumbered ones above. */
case OP_CBRA:
case OP_SCBRA:
case OP_BRA:
case OP_SBRA:
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_BRAPOS:
case OP_SBRAPOS:
case OP_ONCE:
case OP_ONCE_NC:
d = find_minlength(re, cc, startcode, utf, recurses);
if (d < 0) return d;
branchlength += d;
recno = GET2(cc, 1+LINK_SIZE);
if (recno != prev_recno)
{
prev_recno = recno;
prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
if (prev_d < 0) return prev_d;
}
branchlength += prev_d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
@ -427,7 +466,7 @@ for (;;)
int dd;
this_recurse.prev = recurses;
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, utf, &this_recurse);
dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
if (dd < d) d = dd;
}
}
@ -463,7 +502,7 @@ for (;;)
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, utf, &this_recurse);
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
}
}
}
@ -509,6 +548,13 @@ for (;;)
case OP_RECURSE:
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
recno = GET2(cs, 1+LINK_SIZE);
if (recno == prev_recno)
{
branchlength += prev_d;
}
else
{
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE;
@ -522,10 +568,16 @@ for (;;)
{
this_recurse.prev = recurses;
this_recurse.group = cs;
branchlength += find_minlength(re, cs, startcode, utf, &this_recurse);
prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr);
if (prev_d < 0) return prev_d;
prev_recno = recno;
branchlength += prev_d;
}
}
cc += 1 + LINK_SIZE;
}
cc += 1 + LINK_SIZE + once_fudge;
once_fudge = 0;
break;
/* Anything else does not or need not match a character. We can get the
@ -1441,6 +1493,7 @@ int
PRIV(study)(pcre2_real_code *re)
{
int min;
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
@ -1463,9 +1516,9 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
/* Find the minimum length of subject string. */
switch(min = find_minlength(re, code, code, utf, NULL))
switch(min = find_minlength(re, code, code, utf, NULL, &count))
{
case -1: /* \C in UTF mode or (*ACCEPT) */
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
break; /* Leave minlength unchanged (will be zero) */
case -2:
@ -1475,6 +1528,7 @@ switch(min = find_minlength(re, code, code, utf, NULL))
return 3; /* unrecognized opcode */
default:
if (min > UINT16_MAX) min = UINT16_MAX;
re->minlength = min;
break;
}

2
testdata/testinput2 vendored
View File

@ -4360,4 +4360,6 @@ a random value. /Ix
/(?(?C{\Q})(?!(?'abc')))/I
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
# End of testinput2

12
testdata/testoutput2 vendored
View File

@ -3942,7 +3942,7 @@ No match
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
Subject length lower bound = 2
Subject length lower bound = 3
xyz
0: xyz
1: xyz
@ -14566,4 +14566,14 @@ Named capturing subpatterns:
May match empty string
Subject length lower bound = 0
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
Capturing subpattern count = 8
Max back reference = 8
Named capturing subpatterns:
R 7
R 8
Duplicate name status changes
Last code unit = '0'
Subject length lower bound = 65535
# End of testinput2