Fix incorrect minimum matching length when pattern contains (?| groups.

This commit is contained in:
Philip.Hazel 2015-08-03 13:18:49 +00:00
parent 8269696f57
commit 3e12e15fe1
8 changed files with 172 additions and 34 deletions

View File

@ -92,6 +92,12 @@ gives up trying to find a minimum length when things get too complex.
24. An optimization has been added that speeds up finding the minimum matching 24. An optimization has been added that speeds up finding the minimum matching
length for patterns containing repeated capturing groups or recursions. length for patterns containing repeated capturing groups or recursions.
25. If a pattern contained a back reference to a group whose number was
duplicated as a result of appearing in a (?|...) group, the computation of the
minimum matching length gave a wrong result, which could cause incorrect "no
match" errors. For such patterns, a minimum matching length cannot at present
be computed.
Version 10.20 30-June-2015 Version 10.20 30-June-2015
-------------------------- --------------------------

View File

@ -3215,6 +3215,7 @@ for (; ptr < cb->end_pattern; ptr++)
top_nest->reset_group = cb->bracount; top_nest->reset_group = cb->bracount;
top_nest->max_group = cb->bracount; top_nest->max_group = cb->bracount;
top_nest->flags |= NSF_RESET; top_nest->flags |= NSF_RESET;
cb->external_flags |= PCRE2_DUPCAPUSED;
break; break;
} }

View File

@ -524,9 +524,10 @@ bytes in a code unit in that mode. */
#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */ #define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */
#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */ #define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */
#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */ #define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */
#define PCRE2_DEREF_TABLES 0x00040000 /* Release character tables. */ #define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */
#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */ #define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */
#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */ #define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */
#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) #define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)

View File

@ -74,12 +74,13 @@ Arguments:
startcode pointer to start of the whole pattern's code startcode pointer to start of the whole pattern's code
utf UTF flag utf UTF flag
recurses chain of recurse_check to catch mutual recursion recurses chain of recurse_check to catch mutual recursion
countptr pointer to call count (to catch over complexity) countptr pointer to call count (to catch over complexity)
Returns: the minimum length Returns: the minimum length
-1 \C in UTF-8 mode -1 \C in UTF-8 mode
or (*ACCEPT) or (*ACCEPT)
or pattern too complicated or pattern too complicated
or back reference to duplicate name/number
-2 internal error (missing capturing bracket) -2 internal error (missing capturing bracket)
-3 internal error (opcode not listed) -3 internal error (opcode not listed)
*/ */
@ -89,10 +90,13 @@ find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr) PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
{ {
int length = -1; int length = -1;
int prev_recno = -1; int prev_cap_recno = -1;
int prev_d = 0; int prev_cap_d = 0;
int prev_recurse_recno = -1;
int prev_recurse_d = 0;
uint32_t once_fudge = 0; uint32_t once_fudge = 0;
BOOL had_recurse = FALSE; BOOL had_recurse = FALSE;
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
recurse_check this_recurse; recurse_check this_recurse;
register int branchlength = 0; register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
@ -114,7 +118,7 @@ for (;;)
int d, min, recno; int d, min, recno;
PCRE2_UCHAR *cs, *ce; PCRE2_UCHAR *cs, *ce;
register PCRE2_UCHAR op = *cc; register PCRE2_UCHAR op = *cc;
switch (op) switch (op)
{ {
case OP_COND: case OP_COND:
@ -133,26 +137,26 @@ for (;;)
} }
goto PROCESS_NON_CAPTURE; goto PROCESS_NON_CAPTURE;
/* There's a special case of OP_ONCE, when it is wrapped round an /* There's a special case of OP_ONCE, when it is wrapped round an
OP_RECURSE. We'd like to process the latter at this level so that OP_RECURSE. We'd like to process the latter at this level so that
remembering the value works for repeated cases. So we do nothing, but remembering the value works for repeated cases. So we do nothing, but
set a fudge value to skip over the OP_KET after the recurse. */ set a fudge value to skip over the OP_KET after the recurse. */
case OP_ONCE: case OP_ONCE:
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
{ {
once_fudge = 1 + LINK_SIZE; once_fudge = 1 + LINK_SIZE;
cc += 1 + LINK_SIZE; cc += 1 + LINK_SIZE;
break; break;
} }
/* Fall through */ /* Fall through */
case OP_ONCE_NC: case OP_ONCE_NC:
case OP_BRA: case OP_BRA:
case OP_SBRA: case OP_SBRA:
case OP_BRAPOS: case OP_BRAPOS:
case OP_SBRAPOS: case OP_SBRAPOS:
PROCESS_NON_CAPTURE: PROCESS_NON_CAPTURE:
d = find_minlength(re, cc, startcode, utf, recurses, countptr); d = find_minlength(re, cc, startcode, utf, recurses, countptr);
if (d < 0) return d; if (d < 0) return d;
branchlength += d; branchlength += d;
@ -162,24 +166,25 @@ for (;;)
/* To save time for repeated capturing subpatterns, we remember the /* To save time for repeated capturing subpatterns, we remember the
length of the previous one. Unfortunately we can't do the same for length of the previous one. Unfortunately we can't do the same for
the unnumbered ones above. */ the unnumbered ones above. Nor can we do this if (?| is present in the
pattern because captures with the same number are not then identical. */
case OP_CBRA: case OP_CBRA:
case OP_SCBRA: case OP_SCBRA:
case OP_CBRAPOS: case OP_CBRAPOS:
case OP_SCBRAPOS: case OP_SCBRAPOS:
recno = GET2(cc, 1+LINK_SIZE); recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
if (recno != prev_recno) if (recno != prev_cap_recno)
{ {
prev_recno = recno; prev_cap_recno = recno;
prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr); prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
if (prev_d < 0) return prev_d; if (prev_cap_d < 0) return prev_cap_d;
} }
branchlength += prev_d; branchlength += prev_cap_d;
do cc += GET(cc, 1); while (*cc == OP_ALT); do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE; cc += 1 + LINK_SIZE;
break; break;
/* ACCEPT makes things far too complicated; we have to give up. */ /* ACCEPT makes things far too complicated; we have to give up. */
case OP_ACCEPT: case OP_ACCEPT:
@ -427,8 +432,12 @@ for (;;)
matches an empty string (by default it causes a matching failure), so in matches an empty string (by default it causes a matching failure), so in
that case we must set the minimum length to zero. */ that case we must set the minimum length to zero. */
case OP_DNREF: /* Duplicate named pattern back reference */ /* Duplicate named pattern back reference. We cannot reliably find a length
for this if duplicate numbers are present in the pattern. */
case OP_DNREF:
case OP_DNREFI: case OP_DNREFI:
if (dupcapused) return -1;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{ {
int count = GET2(cc, 1+IMM2_SIZE); int count = GET2(cc, 1+IMM2_SIZE);
@ -477,8 +486,12 @@ for (;;)
cc += 1 + 2*IMM2_SIZE; cc += 1 + 2*IMM2_SIZE;
goto REPEAT_BACK_REFERENCE; goto REPEAT_BACK_REFERENCE;
case OP_REF: /* Single back reference */ /* Single back reference. We cannot find a length for this if duplicate
numbers are present in the pattern. */
case OP_REF:
case OP_REFI: case OP_REFI:
if (dupcapused) return -1;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{ {
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
@ -546,15 +559,19 @@ for (;;)
branchlength += min * d; branchlength += min * d;
break; break;
/* Recursion always refers to the first occurrence of a subpattern with a
given number. Therefore, we can always make use of caching, even when the
pattern contains multiple subpatterns with the same number. */
case OP_RECURSE: case OP_RECURSE:
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
recno = GET2(cs, 1+LINK_SIZE); recno = GET2(cs, 1+LINK_SIZE);
if (recno == prev_recno) if (recno == prev_recurse_recno)
{ {
branchlength += prev_d; branchlength += prev_recurse_d;
} }
else else
{ {
do ce += GET(ce, 1); while (*ce == OP_ALT); do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */ if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE; had_recurse = TRUE;
@ -568,16 +585,16 @@ for (;;)
{ {
this_recurse.prev = recurses; this_recurse.prev = recurses;
this_recurse.group = cs; this_recurse.group = cs;
prev_d = find_minlength(re, cs, startcode, utf, &this_recurse, prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr); countptr);
if (prev_d < 0) return prev_d; if (prev_recurse_d < 0) return prev_recurse_d;
prev_recno = recno; prev_recurse_recno = recno;
branchlength += prev_d; branchlength += prev_recurse_d;
} }
} }
} }
cc += 1 + LINK_SIZE + once_fudge; cc += 1 + LINK_SIZE + once_fudge;
once_fudge = 0; once_fudge = 0;
break; break;
/* Anything else does not or need not match a character. We can get the /* Anything else does not or need not match a character. We can get the

22
testdata/testinput1 vendored
View File

@ -5727,4 +5727,26 @@ name)/mark
"(?|(\k'Pm')|(?'Pm'))" "(?|(\k'Pm')|(?'Pm'))"
abcd abcd
/(?|(aaa)|(b))\g{1}/
aaaaaa
bb
/(?|(aaa)|(b))(?1)/
aaaaaa
baaa
** Failers
bb
/(?|(aaa)|(b))/
xaaa
xbc
/(?|(?'a'aaa)|(?'a'b))\k'a'/
aaaaaa
bb
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
aaaccccaaa
bccccb
# End of testinput1 # End of testinput1

10
testdata/testinput2 vendored
View File

@ -4362,4 +4362,14 @@ a random value. /Ix
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I /(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
/(?|(aaa)|(b))\g{1}/I
/(?|(aaa)|(b))(?1)/I
/(?|(aaa)|(b))/I
/(?|(?'a'aaa)|(?'a'b))\k'a'/I
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
# End of testinput2 # End of testinput2

46
testdata/testoutput1 vendored
View File

@ -9463,4 +9463,50 @@ No match
0: 0:
1: 1:
/(?|(aaa)|(b))\g{1}/
aaaaaa
0: aaaaaa
1: aaa
bb
0: bb
1: b
/(?|(aaa)|(b))(?1)/
aaaaaa
0: aaaaaa
1: aaa
baaa
0: baaa
1: b
** Failers
No match
bb
No match
/(?|(aaa)|(b))/
xaaa
0: aaa
1: aaa
xbc
0: b
1: b
/(?|(?'a'aaa)|(?'a'b))\k'a'/
aaaaaa
0: aaaaaa
1: aaa
bb
0: bb
1: b
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
aaaccccaaa
0: aaaccccaaa
1: aaa
2: cccc
bccccb
0: bccccb
1: b
2: cccc
# End of testinput1 # End of testinput1

35
testdata/testoutput2 vendored
View File

@ -14576,4 +14576,39 @@ Duplicate name status changes
Last code unit = '0' Last code unit = '0'
Subject length lower bound = 65535 Subject length lower bound = 65535
/(?|(aaa)|(b))\g{1}/I
Capturing subpattern count = 1
Max back reference = 1
Starting code units: a b
Subject length lower bound = 0
/(?|(aaa)|(b))(?1)/I
Capturing subpattern count = 1
Starting code units: a b
Subject length lower bound = 4
/(?|(aaa)|(b))/I
Capturing subpattern count = 1
Starting code units: a b
Subject length lower bound = 1
/(?|(?'a'aaa)|(?'a'b))\k'a'/I
Capturing subpattern count = 1
Max back reference = 1
Named capturing subpatterns:
a 1
Starting code units: a b
Subject length lower bound = 0
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
Capturing subpattern count = 2
Max back reference = 2
Named capturing subpatterns:
a 1
a 2
Options: dupnames
Starting code units: a b
Last code unit = 'c'
Subject length lower bound = 0
# End of testinput2 # End of testinput2