Fix issues with minimum length finding.
This commit is contained in:
parent
d1caa059fc
commit
8269696f57
|
@ -85,6 +85,13 @@ string if the final multi-byte UTF-8 character was truncated.
|
|||
class, where both values are literal letters in the same case, omit the
|
||||
non-letter EBCDIC code points within the range.
|
||||
|
||||
23. Finding the minimum matching length of complex patterns with back
|
||||
references and/or recursions can take a long time. There is now a cut-off that
|
||||
gives up trying to find a minimum length when things get too complex.
|
||||
|
||||
24. An optimization has been added that speeds up finding the minimum matching
|
||||
length for patterns containing repeated capturing groups or recursions.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
|
@ -83,7 +83,7 @@ for (;;)
|
|||
if (c == OP_XCLASS) code += GET(code, 1);
|
||||
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
|
||||
|
||||
/* Handle recursion */
|
||||
/* Handle lookbehind */
|
||||
|
||||
else if (c == OP_REVERSE)
|
||||
{
|
||||
|
|
|
@ -59,7 +59,6 @@ collecting data (e.g. minimum matching length). */
|
|||
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find the minimum subject length for a group *
|
||||
*************************************************/
|
||||
|
@ -75,24 +74,35 @@ Arguments:
|
|||
startcode pointer to start of the whole pattern's code
|
||||
utf UTF flag
|
||||
recurses chain of recurse_check to catch mutual recursion
|
||||
countptr pointer to call count (to catch over-complexity)
|
||||
|
||||
Returns: the minimum length
|
||||
-1 \C in UTF mode
|
||||
or (*ACCEPT)
|
||||
or pattern too complicated
|
||||
-2 internal error (missing capturing bracket)
|
||||
-3 internal error (opcode not listed)
|
||||
*/
|
||||
|
||||
static int
|
||||
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
|
||||
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses)
|
||||
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
|
||||
{
|
||||
int length = -1;
|
||||
int prev_recno = -1;
|
||||
int prev_d = 0;
|
||||
uint32_t once_fudge = 0;
|
||||
BOOL had_recurse = FALSE;
|
||||
recurse_check this_recurse;
|
||||
register int branchlength = 0;
|
||||
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
||||
|
||||
/* A large and/or complex regex can take too long to process. */
|
||||
|
||||
if ((*countptr)++ > 1000) return -1;
|
||||
|
||||
/* Skip over capturing bracket number */
|
||||
|
||||
if (*code == OP_CBRA || *code == OP_SCBRA ||
|
||||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
|
||||
|
||||
|
@ -101,10 +111,10 @@ branch, check the length against that of the other branches. */
|
|||
|
||||
for (;;)
|
||||
{
|
||||
int d, min;
|
||||
int d, min, recno;
|
||||
PCRE2_UCHAR *cs, *ce;
|
||||
register PCRE2_UCHAR op = *cc;
|
||||
|
||||
|
||||
switch (op)
|
||||
{
|
||||
case OP_COND:
|
||||
|
@ -112,7 +122,8 @@ for (;;)
|
|||
|
||||
/* If there is only one branch in a condition, the implied branch has zero
|
||||
length, so we don't add anything. This covers the DEFINE "condition"
|
||||
automatically. */
|
||||
automatically. If there are two branches we can treat it the same as any
|
||||
other non-capturing subpattern. */
|
||||
|
||||
cs = cc + GET(cc, 1);
|
||||
if (*cs != OP_ALT)
|
||||
|
@ -120,27 +131,55 @@ for (;;)
|
|||
cc = cs + 1 + LINK_SIZE;
|
||||
break;
|
||||
}
|
||||
goto PROCESS_NON_CAPTURE;
|
||||
|
||||
/* Otherwise we can fall through and treat it the same as any other
|
||||
subpattern. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
/* There's a special case of OP_ONCE, when it is wrapped round an
|
||||
OP_RECURSE. We'd like to process the latter at this level so that
|
||||
remembering the value works for repeated cases. So we do nothing, but
|
||||
set a fudge value to skip over the OP_KET after the recurse. */
|
||||
|
||||
case OP_ONCE:
|
||||
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
|
||||
{
|
||||
once_fudge = 1 + LINK_SIZE;
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
}
|
||||
/* Fall through */
|
||||
|
||||
case OP_ONCE_NC:
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
d = find_minlength(re, cc, startcode, utf, recurses);
|
||||
PROCESS_NON_CAPTURE:
|
||||
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* To save time for repeated capturing subpatterns, we remember the
|
||||
length of the previous one. Unfortunately we can't do the same for
|
||||
the unnumbered ones above. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
recno = GET2(cc, 1+LINK_SIZE);
|
||||
if (recno != prev_recno)
|
||||
{
|
||||
prev_recno = recno;
|
||||
prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||
if (prev_d < 0) return prev_d;
|
||||
}
|
||||
branchlength += prev_d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* ACCEPT makes things far too complicated; we have to give up. */
|
||||
|
||||
case OP_ACCEPT:
|
||||
|
@ -427,7 +466,7 @@ for (;;)
|
|||
int dd;
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
dd = find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||
dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
|
||||
if (dd < d) d = dd;
|
||||
}
|
||||
}
|
||||
|
@ -463,7 +502,7 @@ for (;;)
|
|||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
d = find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -509,23 +548,36 @@ for (;;)
|
|||
|
||||
case OP_RECURSE:
|
||||
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
recno = GET2(cs, 1+LINK_SIZE);
|
||||
if (recno == prev_recno)
|
||||
{
|
||||
recurse_check *r = recurses;
|
||||
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||
if (r != NULL) /* Mutual recursion */
|
||||
branchlength += prev_d;
|
||||
}
|
||||
else
|
||||
{
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
branchlength += find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||
recurse_check *r = recurses;
|
||||
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||
if (r != NULL) /* Mutual recursion */
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
|
||||
countptr);
|
||||
if (prev_d < 0) return prev_d;
|
||||
prev_recno = recno;
|
||||
branchlength += prev_d;
|
||||
}
|
||||
}
|
||||
}
|
||||
cc += 1 + LINK_SIZE;
|
||||
}
|
||||
cc += 1 + LINK_SIZE + once_fudge;
|
||||
once_fudge = 0;
|
||||
break;
|
||||
|
||||
/* Anything else does not or need not match a character. We can get the
|
||||
|
@ -1441,6 +1493,7 @@ int
|
|||
PRIV(study)(pcre2_real_code *re)
|
||||
{
|
||||
int min;
|
||||
int count = 0;
|
||||
PCRE2_UCHAR *code;
|
||||
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||
|
||||
|
@ -1463,9 +1516,9 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
|
|||
|
||||
/* Find the minimum length of subject string. */
|
||||
|
||||
switch(min = find_minlength(re, code, code, utf, NULL))
|
||||
switch(min = find_minlength(re, code, code, utf, NULL, &count))
|
||||
{
|
||||
case -1: /* \C in UTF mode or (*ACCEPT) */
|
||||
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
|
||||
break; /* Leave minlength unchanged (will be zero) */
|
||||
|
||||
case -2:
|
||||
|
@ -1475,6 +1528,7 @@ switch(min = find_minlength(re, code, code, utf, NULL))
|
|||
return 3; /* unrecognized opcode */
|
||||
|
||||
default:
|
||||
if (min > UINT16_MAX) min = UINT16_MAX;
|
||||
re->minlength = min;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -4360,4 +4360,6 @@ a random value. /Ix
|
|||
|
||||
/(?(?C{\Q})(?!(?'abc')))/I
|
||||
|
||||
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -3942,7 +3942,7 @@ No match
|
|||
Capturing subpattern count = 2
|
||||
Compile options: <none>
|
||||
Overall options: anchored
|
||||
Subject length lower bound = 2
|
||||
Subject length lower bound = 3
|
||||
xyz
|
||||
0: xyz
|
||||
1: xyz
|
||||
|
@ -14566,4 +14566,14 @@ Named capturing subpatterns:
|
|||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
|
||||
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
|
||||
Capturing subpattern count = 8
|
||||
Max back reference = 8
|
||||
Named capturing subpatterns:
|
||||
R 7
|
||||
R 8
|
||||
Duplicate name status changes
|
||||
Last code unit = '0'
|
||||
Subject length lower bound = 65535
|
||||
|
||||
# End of testinput2
|
||||
|
|
Loading…
Reference in New Issue