From 1ebc2c50ccd6ac360463799ec62fa526b7771449 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Mon, 17 Jun 2019 16:26:44 +0000 Subject: [PATCH] Another extension to minimum length calculation. --- ChangeLog | 4 ++ src/pcre2_study.c | 92 ++++++++++++++++++++++------------------ testdata/testinput10 | 2 + testdata/testinput12 | 2 + testdata/testinput2 | 16 +++++++ testdata/testoutput10 | 9 ++++ testdata/testoutput12-16 | 8 ++++ testdata/testoutput12-32 | 8 ++++ testdata/testoutput2 | 63 +++++++++++++++++++++++++++ 9 files changed, 163 insertions(+), 41 deletions(-) diff --git a/ChangeLog b/ChangeLog index b3debf4..52f22f7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -42,6 +42,10 @@ minimum is potentially useful. such references as not adding to the minimum length (which it should have done all along). + * Furthermore, the above action now happens only if the back reference is to + a group that exists more than once in a pattern instead of any back + reference in a pattern with duplicate numbers. + 10. A (*MARK) value inside a successful condition was not being returned by the interpretive matcher (it was returned by JIT). This bug has been mended. diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 496c8dc..a6790be 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -134,7 +134,7 @@ for (;;) int d, min, recno; PCRE2_UCHAR *cs, *ce; PCRE2_UCHAR op = *cc; - + if (branchlength >= UINT16_MAX) return UINT16_MAX; switch (op) @@ -448,11 +448,13 @@ for (;;) If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket matches an empty string (by default it causes a matching failure), so in - that case we must set the minimum length to zero. */ + that case we must set the minimum length to zero. - /* Duplicate named pattern back reference. We cannot reliably find a length - for this if duplicate numbers are present in the pattern, so we set the - length to zero here also. */ + For backreferenes, if duplicate numbers are present in the pattern we check + for a reference to a duplicate. If it is, we don't know which version will + be referenced, so we have to set the minimum length to zero. */ + + /* Duplicate named pattern back reference. */ case OP_DNREF: case OP_DNREFI: @@ -479,30 +481,34 @@ for (;;) ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); if (cs == NULL) return -2; do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ + + dd = 0; + if (!dupcapused || + (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL) { - dd = 0; - had_recurse = TRUE; - } - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) - if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ + if (cc > cs && cc < ce) /* Simple recursion */ { - dd = 0; had_recurse = TRUE; } else { - this_recurse.prev = recurses; - this_recurse.group = cs; - dd = find_minlength(re, cs, startcode, utf, &this_recurse, - countptr, backref_cache); - if (dd < 0) return dd; + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) + if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + had_recurse = TRUE; + } + else + { + this_recurse.prev = recurses; /* No recursion */ + this_recurse.group = cs; + dd = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr, backref_cache); + if (dd < 0) return dd; + } } - } + } backref_cache[recno] = dd; for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; @@ -518,8 +524,8 @@ for (;;) cc += 1 + 2*IMM2_SIZE; goto REPEAT_BACK_REFERENCE; - /* Single back reference. We cannot find a length for this if duplicate - numbers are present in the pattern. */ + /* Single back reference by number. References by name are converted to by + number when there is no duplication. */ case OP_REF: case OP_REFI: @@ -529,36 +535,40 @@ for (;;) else { int i; - if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + d = 0; + + if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); if (cs == NULL) return -2; do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ + + if (!dupcapused || + (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL) { - d = 0; - had_recurse = TRUE; - } - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ + if (cc > cs && cc < ce) /* Simple recursion */ { - d = 0; had_recurse = TRUE; } else { - this_recurse.prev = recurses; - this_recurse.group = cs; - d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, - backref_cache); - if (d < 0) return d; + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + had_recurse = TRUE; + } + else /* No recursion */ + { + this_recurse.prev = recurses; + this_recurse.group = cs; + d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, + backref_cache); + if (d < 0) return d; + } } } } - else d = 0; backref_cache[recno] = d; for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; diff --git a/testdata/testinput10 b/testdata/testinput10 index 2029c44..5e8458f 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -557,4 +557,6 @@ # ------------------------------------- +/(*UTF)(?=\x{123})/I + # End of testinput10 diff --git a/testdata/testinput12 b/testdata/testinput12 index b1af30a..1f68732 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -447,4 +447,6 @@ # ---------------------------------------------------- +/(*UTF)(?=\x{123})/I + # End of testinput12 diff --git a/testdata/testinput2 b/testdata/testinput2 index bfec80c..3c1e589 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5607,4 +5607,20 @@ a)"xI /(*:\Q \E){5}/alt_verbnames +/(?=abc)/I + +/(?|(X)|(XY))\1abc/I + +/(?|(a)|(bcde))(c)\2/I + +/(?|(a)|(bcde))(c)\1/I + +/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames + +/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames + +/((a|)+)+Z/I + +/((?=a))[abcd]/I + # End of testinput2 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 0a336d8..e5b7c13 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1757,4 +1757,13 @@ No match # ------------------------------------- +/(*UTF)(?=\x{123})/I +Capture group count = 0 +May match empty string +Compile options: +Overall options: utf +First code unit = \xc4 +Last code unit = \xa3 +Subject length lower bound = 1 + # End of testinput10 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index fd6eef7..9b11b62 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1579,4 +1579,12 @@ No match # ---------------------------------------------------- +/(*UTF)(?=\x{123})/I +Capture group count = 0 +May match empty string +Compile options: +Overall options: utf +First code unit = \x{123} +Subject length lower bound = 1 + # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 768fd7e..9b1b38b 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1577,4 +1577,12 @@ No match # ---------------------------------------------------- +/(*UTF)(?=\x{123})/I +Capture group count = 0 +May match empty string +Compile options: +Overall options: utf +First code unit = \x{123} +Subject length lower bound = 1 + # End of testinput12 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 9c1dc50..6a93525 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -16963,6 +16963,69 @@ Failed: error 109 at offset 5: quantifier does not follow a repeatable item /(*:\Q \E){5}/alt_verbnames Failed: error 109 at offset 11: quantifier does not follow a repeatable item +/(?=abc)/I +Capture group count = 0 +May match empty string +First code unit = 'a' +Last code unit = 'c' +Subject length lower bound = 2 + +/(?|(X)|(XY))\1abc/I +Capture group count = 1 +Max back reference = 1 +First code unit = 'X' +Last code unit = 'c' +Subject length lower bound = 4 + +/(?|(a)|(bcde))(c)\2/I +Capture group count = 2 +Max back reference = 2 +Starting code units: a b +Last code unit = 'c' +Subject length lower bound = 3 + +/(?|(a)|(bcde))(c)\1/I +Capture group count = 2 +Max back reference = 1 +Starting code units: a b +Last code unit = 'c' +Subject length lower bound = 2 + +/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames +Capture group count = 3 +Max back reference = 2 +Named capture groups: + A 1 + A 3 + B 2 +Options: dupnames +Starting code units: a b +Last code unit = 'c' +Subject length lower bound = 3 + +/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames +Capture group count = 3 +Max back reference = 3 +Named capture groups: + A 1 + A 3 + B 2 +Options: dupnames +Starting code units: a b +Last code unit = 'c' +Subject length lower bound = 2 + +/((a|)+)+Z/I +Capture group count = 2 +Starting code units: Z a +Last code unit = 'Z' +Subject length lower bound = 1 + +/((?=a))[abcd]/I +Capture group count = 1 +First code unit = 'a' +Subject length lower bound = 1 + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data