From 8269696f57dc3e6631dc11ff48df36d1896a5e3f Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 1 Aug 2015 09:11:28 +0000 Subject: [PATCH] Fix issues with minimum length finding. --- ChangeLog | 7 +++ src/pcre2_find_bracket.c | 2 +- src/pcre2_study.c | 116 ++++++++++++++++++++++++++++----------- testdata/testinput2 | 2 + testdata/testoutput2 | 12 +++- 5 files changed, 106 insertions(+), 33 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6ea8aae..7170b0f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -85,6 +85,13 @@ string if the final multi-byte UTF-8 character was truncated. class, where both values are literal letters in the same case, omit the non-letter EBCDIC code points within the range. +23. Finding the minimum matching length of complex patterns with back +references and/or recursions can take a long time. There is now a cut-off that +gives up trying to find a minimum length when things get too complex. + +24. An optimization has been added that speeds up finding the minimum matching +length for patterns containing repeated capturing groups or recursions. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_find_bracket.c b/src/pcre2_find_bracket.c index 3cdc419..fba4b47 100644 --- a/src/pcre2_find_bracket.c +++ b/src/pcre2_find_bracket.c @@ -83,7 +83,7 @@ for (;;) if (c == OP_XCLASS) code += GET(code, 1); else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); - /* Handle recursion */ + /* Handle lookbehind */ else if (c == OP_REVERSE) { diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 25d7e51..946893d 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -59,7 +59,6 @@ collecting data (e.g. minimum matching length). */ enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; - /************************************************* * Find the minimum subject length for a group * *************************************************/ @@ -75,24 +74,35 @@ Arguments: startcode pointer to start of the whole pattern's code utf UTF flag recurses chain of recurse_check to catch mutual recursion + countptr pointer to call count (to catch over complexity) Returns: the minimum length -1 \C in UTF-8 mode or (*ACCEPT) + or pattern too complicated -2 internal error (missing capturing bracket) -3 internal error (opcode not listed) */ static int find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, - PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses) + PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr) { int length = -1; +int prev_recno = -1; +int prev_d = 0; +uint32_t once_fudge = 0; BOOL had_recurse = FALSE; recurse_check this_recurse; register int branchlength = 0; register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; +/* A large and/or complex regex can take too long to process. */ + +if ((*countptr)++ > 1000) return -1; + +/* Skip over capturing bracket number */ + if (*code == OP_CBRA || *code == OP_SCBRA || *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; @@ -101,10 +111,10 @@ branch, check the length against that of the other branches. */ for (;;) { - int d, min; + int d, min, recno; PCRE2_UCHAR *cs, *ce; register PCRE2_UCHAR op = *cc; - + switch (op) { case OP_COND: @@ -112,7 +122,8 @@ for (;;) /* If there is only one branch in a condition, the implied branch has zero length, so we don't add anything. This covers the DEFINE "condition" - automatically. */ + automatically. If there are two branches we can treat it the same as any + other non-capturing subpattern. */ cs = cc + GET(cc, 1); if (*cs != OP_ALT) @@ -120,27 +131,55 @@ for (;;) cc = cs + 1 + LINK_SIZE; break; } + goto PROCESS_NON_CAPTURE; - /* Otherwise we can fall through and treat it the same as any other - subpattern. */ - - case OP_CBRA: - case OP_SCBRA: + /* There's a special case of OP_ONCE, when it is wrapped round an + OP_RECURSE. We'd like to process the latter at this level so that + remembering the value works for repeated cases. So we do nothing, but + set a fudge value to skip over the OP_KET after the recurse. */ + + case OP_ONCE: + if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) + { + once_fudge = 1 + LINK_SIZE; + cc += 1 + LINK_SIZE; + break; + } + /* Fall through */ + + case OP_ONCE_NC: case OP_BRA: case OP_SBRA: - case OP_CBRAPOS: - case OP_SCBRAPOS: case OP_BRAPOS: case OP_SBRAPOS: - case OP_ONCE: - case OP_ONCE_NC: - d = find_minlength(re, cc, startcode, utf, recurses); + PROCESS_NON_CAPTURE: + d = find_minlength(re, cc, startcode, utf, recurses, countptr); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); cc += 1 + LINK_SIZE; break; + /* To save time for repeated capturing subpatterns, we remember the + length of the previous one. Unfortunately we can't do the same for + the unnumbered ones above. */ + + case OP_CBRA: + case OP_SCBRA: + case OP_CBRAPOS: + case OP_SCBRAPOS: + recno = GET2(cc, 1+LINK_SIZE); + if (recno != prev_recno) + { + prev_recno = recno; + prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr); + if (prev_d < 0) return prev_d; + } + branchlength += prev_d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + /* ACCEPT makes things far too complicated; we have to give up. */ case OP_ACCEPT: @@ -427,7 +466,7 @@ for (;;) int dd; this_recurse.prev = recurses; this_recurse.group = cs; - dd = find_minlength(re, cs, startcode, utf, &this_recurse); + dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); if (dd < d) d = dd; } } @@ -463,7 +502,7 @@ for (;;) { this_recurse.prev = recurses; this_recurse.group = cs; - d = find_minlength(re, cs, startcode, utf, &this_recurse); + d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); } } } @@ -509,23 +548,36 @@ for (;;) case OP_RECURSE: cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ - had_recurse = TRUE; - else + recno = GET2(cs, 1+LINK_SIZE); + if (recno == prev_recno) { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ + branchlength += prev_d; + } + else + { + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) /* Simple recursion */ had_recurse = TRUE; else { - this_recurse.prev = recurses; - this_recurse.group = cs; - branchlength += find_minlength(re, cs, startcode, utf, &this_recurse); + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + had_recurse = TRUE; + else + { + this_recurse.prev = recurses; + this_recurse.group = cs; + prev_d = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr); + if (prev_d < 0) return prev_d; + prev_recno = recno; + branchlength += prev_d; + } } - } - cc += 1 + LINK_SIZE; + } + cc += 1 + LINK_SIZE + once_fudge; + once_fudge = 0; break; /* Anything else does not or need not match a character. We can get the @@ -1441,6 +1493,7 @@ int PRIV(study)(pcre2_real_code *re) { int min; +int count = 0; PCRE2_UCHAR *code; BOOL utf = (re->overall_options & PCRE2_UTF) != 0; @@ -1463,9 +1516,9 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 && /* Find the minimum length of subject string. */ -switch(min = find_minlength(re, code, code, utf, NULL)) +switch(min = find_minlength(re, code, code, utf, NULL, &count)) { - case -1: /* \C in UTF mode or (*ACCEPT) */ + case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ break; /* Leave minlength unchanged (will be zero) */ case -2: @@ -1475,6 +1528,7 @@ switch(min = find_minlength(re, code, code, utf, NULL)) return 3; /* unrecognized opcode */ default: + if (min > UINT16_MAX) min = UINT16_MAX; re->minlength = min; break; } diff --git a/testdata/testinput2 b/testdata/testinput2 index 1e59452..d6404d8 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4360,4 +4360,6 @@ a random value. /Ix /(?(?C{\Q})(?!(?'abc')))/I +/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 7d8b74c..5302e34 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -3942,7 +3942,7 @@ No match Capturing subpattern count = 2 Compile options: Overall options: anchored -Subject length lower bound = 2 +Subject length lower bound = 3 xyz 0: xyz 1: xyz @@ -14566,4 +14566,14 @@ Named capturing subpatterns: May match empty string Subject length lower bound = 0 +/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I +Capturing subpattern count = 8 +Max back reference = 8 +Named capturing subpatterns: + R 7 + R 8 +Duplicate name status changes +Last code unit = '0' +Subject length lower bound = 65535 + # End of testinput2