Another extension to minimum length calculation.

This commit is contained in:
Philip.Hazel 2019-06-17 16:26:44 +00:00
parent ead78198d1
commit 1ebc2c50cc
9 changed files with 163 additions and 41 deletions

View File

@ -42,6 +42,10 @@ minimum is potentially useful.
such references as not adding to the minimum length (which it should have such references as not adding to the minimum length (which it should have
done all along). done all along).
* Furthermore, the above action now happens only if the back reference is to
a group that exists more than once in a pattern instead of any back
reference in a pattern with duplicate numbers.
10. A (*MARK) value inside a successful condition was not being returned by the 10. A (*MARK) value inside a successful condition was not being returned by the
interpretive matcher (it was returned by JIT). This bug has been mended. interpretive matcher (it was returned by JIT). This bug has been mended.

View File

@ -134,7 +134,7 @@ for (;;)
int d, min, recno; int d, min, recno;
PCRE2_UCHAR *cs, *ce; PCRE2_UCHAR *cs, *ce;
PCRE2_UCHAR op = *cc; PCRE2_UCHAR op = *cc;
if (branchlength >= UINT16_MAX) return UINT16_MAX; if (branchlength >= UINT16_MAX) return UINT16_MAX;
switch (op) switch (op)
@ -448,11 +448,13 @@ for (;;)
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
matches an empty string (by default it causes a matching failure), so in matches an empty string (by default it causes a matching failure), so in
that case we must set the minimum length to zero. */ that case we must set the minimum length to zero.
/* Duplicate named pattern back reference. We cannot reliably find a length For backreferences, if duplicate numbers are present in the pattern we check
for this if duplicate numbers are present in the pattern, so we set the for a reference to a duplicate. If it is, we don't know which version will
length to zero here also. */ be referenced, so we have to set the minimum length to zero. */
/* Duplicate named pattern back reference. */
case OP_DNREF: case OP_DNREF:
case OP_DNREFI: case OP_DNREFI:
@ -479,30 +481,34 @@ for (;;)
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2; if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT); do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
dd = 0;
if (!dupcapused ||
(PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{ {
dd = 0; if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE;
}
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{ {
dd = 0;
had_recurse = TRUE; had_recurse = TRUE;
} }
else else
{ {
this_recurse.prev = recurses; recurse_check *r = recurses;
this_recurse.group = cs; for (r = recurses; r != NULL; r = r->prev)
dd = find_minlength(re, cs, startcode, utf, &this_recurse, if (r->group == cs) break;
countptr, backref_cache); if (r != NULL) /* Mutual recursion */
if (dd < 0) return dd; {
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses; /* No recursion */
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr, backref_cache);
if (dd < 0) return dd;
}
} }
} }
backref_cache[recno] = dd; backref_cache[recno] = dd;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
@ -518,8 +524,8 @@ for (;;)
cc += 1 + 2*IMM2_SIZE; cc += 1 + 2*IMM2_SIZE;
goto REPEAT_BACK_REFERENCE; goto REPEAT_BACK_REFERENCE;
/* Single back reference. We cannot find a length for this if duplicate /* Single back reference by number. References by name are converted to by
numbers are present in the pattern. */ number when there is no duplication. */
case OP_REF: case OP_REF:
case OP_REFI: case OP_REFI:
@ -529,36 +535,40 @@ for (;;)
else else
{ {
int i; int i;
if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) d = 0;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{ {
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2; if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT); do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
if (!dupcapused ||
(PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{ {
d = 0; if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE;
}
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{ {
d = 0;
had_recurse = TRUE; had_recurse = TRUE;
} }
else else
{ {
this_recurse.prev = recurses; recurse_check *r = recurses;
this_recurse.group = cs; for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, if (r != NULL) /* Mutual recursion */
backref_cache); {
if (d < 0) return d; had_recurse = TRUE;
}
else /* No recursion */
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
backref_cache);
if (d < 0) return d;
}
} }
} }
} }
else d = 0;
backref_cache[recno] = d; backref_cache[recno] = d;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;

View File

@ -557,4 +557,6 @@
# ------------------------------------- # -------------------------------------
/(*UTF)(?=\x{123})/I
# End of testinput10 # End of testinput10

View File

@ -447,4 +447,6 @@
# ---------------------------------------------------- # ----------------------------------------------------
/(*UTF)(?=\x{123})/I
# End of testinput12 # End of testinput12

16
testdata/testinput2 vendored
View File

@ -5607,4 +5607,20 @@ a)"xI
/(*:\Q \E){5}/alt_verbnames /(*:\Q \E){5}/alt_verbnames
/(?=abc)/I
/(?|(X)|(XY))\1abc/I
/(?|(a)|(bcde))(c)\2/I
/(?|(a)|(bcde))(c)\1/I
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
/((a|)+)+Z/I
/((?=a))[abcd]/I
# End of testinput2 # End of testinput2

View File

@ -1757,4 +1757,13 @@ No match
# ------------------------------------- # -------------------------------------
/(*UTF)(?=\x{123})/I
Capture group count = 0
May match empty string
Compile options: <none>
Overall options: utf
First code unit = \xc4
Last code unit = \xa3
Subject length lower bound = 1
# End of testinput10 # End of testinput10

View File

@ -1579,4 +1579,12 @@ No match
# ---------------------------------------------------- # ----------------------------------------------------
/(*UTF)(?=\x{123})/I
Capture group count = 0
May match empty string
Compile options: <none>
Overall options: utf
First code unit = \x{123}
Subject length lower bound = 1
# End of testinput12 # End of testinput12

View File

@ -1577,4 +1577,12 @@ No match
# ---------------------------------------------------- # ----------------------------------------------------
/(*UTF)(?=\x{123})/I
Capture group count = 0
May match empty string
Compile options: <none>
Overall options: utf
First code unit = \x{123}
Subject length lower bound = 1
# End of testinput12 # End of testinput12

63
testdata/testoutput2 vendored
View File

@ -16963,6 +16963,69 @@ Failed: error 109 at offset 5: quantifier does not follow a repeatable item
/(*:\Q \E){5}/alt_verbnames /(*:\Q \E){5}/alt_verbnames
Failed: error 109 at offset 11: quantifier does not follow a repeatable item Failed: error 109 at offset 11: quantifier does not follow a repeatable item
/(?=abc)/I
Capture group count = 0
May match empty string
First code unit = 'a'
Last code unit = 'c'
Subject length lower bound = 2
/(?|(X)|(XY))\1abc/I
Capture group count = 1
Max back reference = 1
First code unit = 'X'
Last code unit = 'c'
Subject length lower bound = 4
/(?|(a)|(bcde))(c)\2/I
Capture group count = 2
Max back reference = 2
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 3
/(?|(a)|(bcde))(c)\1/I
Capture group count = 2
Max back reference = 1
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 2
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
Capture group count = 3
Max back reference = 2
Named capture groups:
A 1
A 3
B 2
Options: dupnames
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 3
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
Capture group count = 3
Max back reference = 3
Named capture groups:
A 1
A 3
B 2
Options: dupnames
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 2
/((a|)+)+Z/I
Capture group count = 2
Starting code units: Z a
Last code unit = 'Z'
Subject length lower bound = 1
/((?=a))[abcd]/I
Capture group count = 1
First code unit = 'a'
Subject length lower bound = 1
# End of testinput2 # End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data Error -62: bad serialized data