Another extension to minimum length calculation.

This commit is contained in:
Philip.Hazel 2019-06-17 16:26:44 +00:00
parent ead78198d1
commit 1ebc2c50cc
9 changed files with 163 additions and 41 deletions

View File

@ -42,6 +42,10 @@ minimum is potentially useful.
such references as not adding to the minimum length (which it should have
done all along).
* Furthermore, the above action now happens only if the back reference is to
a group number that is used more than once in the pattern, rather than for
any back reference in a pattern that contains duplicate numbers.
10. A (*MARK) value inside a successful condition was not being returned by the
interpretive matcher (it was returned by JIT). This bug has been mended.

View File

@ -448,11 +448,13 @@ for (;;)
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
matches an empty string (by default it causes a matching failure), so in
that case we must set the minimum length to zero. */
that case we must set the minimum length to zero.
/* Duplicate named pattern back reference. We cannot reliably find a length
for this if duplicate numbers are present in the pattern, so we set the
length to zero here also. */
For backreferences, if duplicate numbers are present in the pattern we check
for a reference to a duplicate. If so, we don't know which version will
be referenced, so we have to set the minimum length to zero. */
/* Duplicate named pattern back reference. */
case OP_DNREF:
case OP_DNREFI:
@ -479,28 +481,32 @@ for (;;)
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
dd = 0;
if (!dupcapused ||
(PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{
dd = 0;
had_recurse = TRUE;
}
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
if (cc > cs && cc < ce) /* Simple recursion */
{
dd = 0;
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr, backref_cache);
if (dd < 0) return dd;
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses; /* No recursion */
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr, backref_cache);
if (dd < 0) return dd;
}
}
}
@ -518,8 +524,8 @@ for (;;)
cc += 1 + 2*IMM2_SIZE;
goto REPEAT_BACK_REFERENCE;
/* Single back reference. We cannot find a length for this if duplicate
numbers are present in the pattern. */
/* Single back reference by number. References by name are converted to
references by number when there is no duplication. */
case OP_REF:
case OP_REFI:
@ -529,36 +535,40 @@ for (;;)
else
{
int i;
if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
d = 0;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
if (!dupcapused ||
(PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{
d = 0;
had_recurse = TRUE;
}
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
backref_cache);
if (d < 0) return d;
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{
had_recurse = TRUE;
}
else /* No recursion */
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
backref_cache);
if (d < 0) return d;
}
}
}
}
else d = 0;
backref_cache[recno] = d;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;

View File

@ -557,4 +557,6 @@
# -------------------------------------
/(*UTF)(?=\x{123})/I
# End of testinput10

View File

@ -447,4 +447,6 @@
# ----------------------------------------------------
/(*UTF)(?=\x{123})/I
# End of testinput12

16
testdata/testinput2 vendored
View File

@ -5607,4 +5607,20 @@ a)"xI
/(*:\Q \E){5}/alt_verbnames
/(?=abc)/I
/(?|(X)|(XY))\1abc/I
/(?|(a)|(bcde))(c)\2/I
/(?|(a)|(bcde))(c)\1/I
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
/((a|)+)+Z/I
/((?=a))[abcd]/I
# End of testinput2

View File

@ -1757,4 +1757,13 @@ No match
# -------------------------------------
/(*UTF)(?=\x{123})/I
Capture group count = 0
May match empty string
Compile options: <none>
Overall options: utf
First code unit = \xc4
Last code unit = \xa3
Subject length lower bound = 1
# End of testinput10

View File

@ -1579,4 +1579,12 @@ No match
# ----------------------------------------------------
/(*UTF)(?=\x{123})/I
Capture group count = 0
May match empty string
Compile options: <none>
Overall options: utf
First code unit = \x{123}
Subject length lower bound = 1
# End of testinput12

View File

@ -1577,4 +1577,12 @@ No match
# ----------------------------------------------------
/(*UTF)(?=\x{123})/I
Capture group count = 0
May match empty string
Compile options: <none>
Overall options: utf
First code unit = \x{123}
Subject length lower bound = 1
# End of testinput12

63
testdata/testoutput2 vendored
View File

@ -16963,6 +16963,69 @@ Failed: error 109 at offset 5: quantifier does not follow a repeatable item
/(*:\Q \E){5}/alt_verbnames
Failed: error 109 at offset 11: quantifier does not follow a repeatable item
/(?=abc)/I
Capture group count = 0
May match empty string
First code unit = 'a'
Last code unit = 'c'
Subject length lower bound = 2
/(?|(X)|(XY))\1abc/I
Capture group count = 1
Max back reference = 1
First code unit = 'X'
Last code unit = 'c'
Subject length lower bound = 4
/(?|(a)|(bcde))(c)\2/I
Capture group count = 2
Max back reference = 2
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 3
/(?|(a)|(bcde))(c)\1/I
Capture group count = 2
Max back reference = 1
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 2
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
Capture group count = 3
Max back reference = 2
Named capture groups:
A 1
A 3
B 2
Options: dupnames
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 3
/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
Capture group count = 3
Max back reference = 3
Named capture groups:
A 1
A 3
B 2
Options: dupnames
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 2
/((a|)+)+Z/I
Capture group count = 2
Starting code units: Z a
Last code unit = 'Z'
Subject length lower bound = 1
/((?=a))[abcd]/I
Capture group count = 1
First code unit = 'a'
Subject length lower bound = 1
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data