From 3e12e15fe1944973e819e52d7225a08cc431f43a Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Mon, 3 Aug 2015 13:18:49 +0000 Subject: [PATCH] Fix incorrect minimum matching length when pattern contains (?| groups. --- ChangeLog | 6 ++++ src/pcre2_compile.c | 1 + src/pcre2_internal.h | 3 +- src/pcre2_study.c | 83 ++++++++++++++++++++++++++------------------ testdata/testinput1 | 22 ++++++++++++ testdata/testinput2 | 10 ++++++ testdata/testoutput1 | 46 ++++++++++++++++++++++++ testdata/testoutput2 | 35 +++++++++++++++++++ 8 files changed, 172 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7170b0f..890f6df 100644 --- a/ChangeLog +++ b/ChangeLog @@ -92,6 +92,12 @@ gives up trying to find a minimum length when things get too complex. 24. An optimization has been added that speeds up finding the minimum matching length for patterns containing repeated capturing groups or recursions. +25. If a pattern contained a back reference to a group whose number was +duplicated as a result of appearing in a (?|...) group, the computation of the +minimum matching length gave a wrong result, which could cause incorrect "no +match" errors. For such patterns, a minimum matching length cannot at present +be computed. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index c25970f..d27a5bd 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3215,6 +3215,7 @@ for (; ptr < cb->end_pattern; ptr++) top_nest->reset_group = cb->bracount; top_nest->max_group = cb->bracount; top_nest->flags |= NSF_RESET; + cb->external_flags |= PCRE2_DUPCAPUSED; break; } diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index a4cf1e0..19564a5 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -524,9 +524,10 @@ bytes in a code unit in that mode. */ #define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */ #define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */ #define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */ -#define PCRE2_DEREF_TABLES 0x00040000 /* Release character tables. */ +#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */ #define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */ #define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */ +#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */ #define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 946893d..c2b0083 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -74,12 +74,13 @@ Arguments: startcode pointer to start of the whole pattern's code utf UTF flag recurses chain of recurse_check to catch mutual recursion - countptr pointer to call count (to catch over complexity) + countptr pointer to call count (to catch over complexity) Returns: the minimum length -1 \C in UTF-8 mode or (*ACCEPT) - or pattern too complicated + or pattern too complicated + or back reference to duplicate name/number -2 internal error (missing capturing bracket) -3 internal error (opcode not listed) */ @@ -89,10 +90,13 @@ find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr) { int length = -1; -int prev_recno = -1; -int prev_d = 0; +int prev_cap_recno = -1; +int prev_cap_d = 0; +int prev_recurse_recno = -1; +int prev_recurse_d = 0; uint32_t once_fudge = 0; BOOL had_recurse = FALSE; +BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0; recurse_check this_recurse; register int branchlength = 0; register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; @@ -114,7 +118,7 @@ for (;;) int d, min, recno; PCRE2_UCHAR *cs, *ce; register PCRE2_UCHAR op = *cc; - + switch (op) { case OP_COND: @@ -133,26 +137,26 @@ for (;;) } goto PROCESS_NON_CAPTURE; - /* There's a special case of OP_ONCE, when it is wrapped round an - OP_RECURSE. We'd like to process the latter at this level so that + /* There's a special case of OP_ONCE, when it is wrapped round an + OP_RECURSE. We'd like to process the latter at this level so that remembering the value works for repeated cases. So we do nothing, but set a fudge value to skip over the OP_KET after the recurse. */ - + case OP_ONCE: if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) { once_fudge = 1 + LINK_SIZE; cc += 1 + LINK_SIZE; - break; - } + break; + } /* Fall through */ - + case OP_ONCE_NC: case OP_BRA: case OP_SBRA: case OP_BRAPOS: case OP_SBRAPOS: - PROCESS_NON_CAPTURE: + PROCESS_NON_CAPTURE: d = find_minlength(re, cc, startcode, utf, recurses, countptr); if (d < 0) return d; branchlength += d; @@ -162,24 +166,25 @@ for (;;) /* To save time for repeated capturing subpatterns, we remember the length of the previous one. Unfortunately we can't do the same for - the unnumbered ones above. */ + the unnumbered ones above. Nor can we do this if (?| is present in the + pattern because captures with the same number are not then identical. */ case OP_CBRA: case OP_SCBRA: case OP_CBRAPOS: case OP_SCBRAPOS: - recno = GET2(cc, 1+LINK_SIZE); - if (recno != prev_recno) + recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE); + if (recno != prev_cap_recno) { - prev_recno = recno; - prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr); - if (prev_d < 0) return prev_d; + prev_cap_recno = recno; + prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr); + if (prev_cap_d < 0) return prev_cap_d; } - branchlength += prev_d; + branchlength += prev_cap_d; do cc += GET(cc, 1); while (*cc == OP_ALT); cc += 1 + LINK_SIZE; break; - + /* ACCEPT makes things far too complicated; we have to give up. */ case OP_ACCEPT: @@ -427,8 +432,12 @@ for (;;) matches an empty string (by default it causes a matching failure), so in that case we must set the minimum length to zero. */ - case OP_DNREF: /* Duplicate named pattern back reference */ + /* Duplicate named pattern back reference. We cannot reliably find a length + for this if duplicate numbers are present in the pattern. */ + + case OP_DNREF: case OP_DNREFI: + if (dupcapused) return -1; if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { int count = GET2(cc, 1+IMM2_SIZE); @@ -477,8 +486,12 @@ for (;;) cc += 1 + 2*IMM2_SIZE; goto REPEAT_BACK_REFERENCE; - case OP_REF: /* Single back reference */ + /* Single back reference. We cannot find a length for this if duplicate + numbers are present in the pattern. */ + + case OP_REF: case OP_REFI: + if (dupcapused) return -1; if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); @@ -546,15 +559,19 @@ for (;;) branchlength += min * d; break; + /* Recursion always refers to the first occurrence of a subpattern with a + given number. Therefore, we can always make use of caching, even when the + pattern contains multiple subpatterns with the same number. */ + case OP_RECURSE: cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); - recno = GET2(cs, 1+LINK_SIZE); - if (recno == prev_recno) + recno = GET2(cs, 1+LINK_SIZE); + if (recno == prev_recurse_recno) { - branchlength += prev_d; - } + branchlength += prev_recurse_d; + } else - { + { do ce += GET(ce, 1); while (*ce == OP_ALT); if (cc > cs && cc < ce) /* Simple recursion */ had_recurse = TRUE; @@ -568,16 +585,16 @@ for (;;) { this_recurse.prev = recurses; this_recurse.group = cs; - prev_d = find_minlength(re, cs, startcode, utf, &this_recurse, + prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); - if (prev_d < 0) return prev_d; - prev_recno = recno; - branchlength += prev_d; + if (prev_recurse_d < 0) return prev_recurse_d; + prev_recurse_recno = recno; + branchlength += prev_recurse_d; } } - } + } cc += 1 + LINK_SIZE + once_fudge; - once_fudge = 0; + once_fudge = 0; break; /* Anything else does not or need not match a character. We can get the diff --git a/testdata/testinput1 b/testdata/testinput1 index 884faf1..f5b801c 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5727,4 +5727,26 @@ name)/mark "(?|(\k'Pm')|(?'Pm'))" abcd +/(?|(aaa)|(b))\g{1}/ + aaaaaa + bb + +/(?|(aaa)|(b))(?1)/ + aaaaaa + baaa + ** Failers + bb + +/(?|(aaa)|(b))/ + xaaa + xbc + +/(?|(?'a'aaa)|(?'a'b))\k'a'/ + aaaaaa + bb + +/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames + aaaccccaaa + bccccb + # End of testinput1 diff --git a/testdata/testinput2 b/testdata/testinput2 index d6404d8..29346d9 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4362,4 +4362,14 @@ a random value. /Ix /(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I +/(?|(aaa)|(b))\g{1}/I + +/(?|(aaa)|(b))(?1)/I + +/(?|(aaa)|(b))/I + +/(?|(?'a'aaa)|(?'a'b))\k'a'/I + +/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames + # End of testinput2 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 942dc7d..153186d 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9463,4 +9463,50 @@ No match 0: 1: +/(?|(aaa)|(b))\g{1}/ + aaaaaa + 0: aaaaaa + 1: aaa + bb + 0: bb + 1: b + +/(?|(aaa)|(b))(?1)/ + aaaaaa + 0: aaaaaa + 1: aaa + baaa + 0: baaa + 1: b + ** Failers +No match + bb +No match + +/(?|(aaa)|(b))/ + xaaa + 0: aaa + 1: aaa + xbc + 0: b + 1: b + +/(?|(?'a'aaa)|(?'a'b))\k'a'/ + aaaaaa + 0: aaaaaa + 1: aaa + bb + 0: bb + 1: b + +/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames + aaaccccaaa + 0: aaaccccaaa + 1: aaa + 2: cccc + bccccb + 0: bccccb + 1: b + 2: cccc + # End of testinput1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 5302e34..9e10df2 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14576,4 +14576,39 @@ Duplicate name status changes Last code unit = '0' Subject length lower bound = 65535 +/(?|(aaa)|(b))\g{1}/I +Capturing subpattern count = 1 +Max back reference = 1 +Starting code units: a b +Subject length lower bound = 0 + +/(?|(aaa)|(b))(?1)/I +Capturing subpattern count = 1 +Starting code units: a b +Subject length lower bound = 4 + +/(?|(aaa)|(b))/I +Capturing subpattern count = 1 +Starting code units: a b +Subject length lower bound = 1 + +/(?|(?'a'aaa)|(?'a'b))\k'a'/I +Capturing subpattern count = 1 +Max back reference = 1 +Named capturing subpatterns: + a 1 +Starting code units: a b +Subject length lower bound = 0 + +/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames +Capturing subpattern count = 2 +Max back reference = 2 +Named capturing subpatterns: + a 1 + a 2 +Options: dupnames +Starting code units: a b +Last code unit = 'c' +Subject length lower bound = 0 + # End of testinput2