Fix incorrect minimum matching length when pattern contains (?| groups.
This commit is contained in:
parent
8269696f57
commit
3e12e15fe1
|
@ -92,6 +92,12 @@ gives up trying to find a minimum length when things get too complex.
|
|||
24. An optimization has been added that speeds up finding the minimum matching
|
||||
length for patterns containing repeated capturing groups or recursions.
|
||||
|
||||
25. If a pattern contained a back reference to a group whose number was
|
||||
duplicated as a result of appearing in a (?|...) group, the computation of the
|
||||
minimum matching length gave a wrong result, which could cause incorrect "no
|
||||
match" errors. For such patterns, a minimum matching length cannot at present
|
||||
be computed.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
|
@ -3215,6 +3215,7 @@ for (; ptr < cb->end_pattern; ptr++)
|
|||
top_nest->reset_group = cb->bracount;
|
||||
top_nest->max_group = cb->bracount;
|
||||
top_nest->flags |= NSF_RESET;
|
||||
cb->external_flags |= PCRE2_DUPCAPUSED;
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
@ -524,9 +524,10 @@ bytes in a code unit in that mode. */
|
|||
#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */
|
||||
#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */
|
||||
#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */
|
||||
#define PCRE2_DEREF_TABLES 0x00040000 /* Release character tables. */
|
||||
#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */
|
||||
#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */
|
||||
#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */
|
||||
#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */
|
||||
|
||||
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)
|
||||
|
||||
|
|
|
@ -74,12 +74,13 @@ Arguments:
|
|||
startcode pointer to start of the whole pattern's code
|
||||
utf UTF flag
|
||||
recurses chain of recurse_check to catch mutual recursion
|
||||
countptr pointer to call count (to catch over complexity)
|
||||
countptr pointer to call count (to catch over complexity)
|
||||
|
||||
Returns: the minimum length
|
||||
-1 \C in UTF-8 mode
|
||||
or (*ACCEPT)
|
||||
or pattern too complicated
|
||||
or pattern too complicated
|
||||
or back reference to duplicate name/number
|
||||
-2 internal error (missing capturing bracket)
|
||||
-3 internal error (opcode not listed)
|
||||
*/
|
||||
|
@ -89,10 +90,13 @@ find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
|
|||
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
|
||||
{
|
||||
int length = -1;
|
||||
int prev_recno = -1;
|
||||
int prev_d = 0;
|
||||
int prev_cap_recno = -1;
|
||||
int prev_cap_d = 0;
|
||||
int prev_recurse_recno = -1;
|
||||
int prev_recurse_d = 0;
|
||||
uint32_t once_fudge = 0;
|
||||
BOOL had_recurse = FALSE;
|
||||
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
|
||||
recurse_check this_recurse;
|
||||
register int branchlength = 0;
|
||||
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
||||
|
@ -114,7 +118,7 @@ for (;;)
|
|||
int d, min, recno;
|
||||
PCRE2_UCHAR *cs, *ce;
|
||||
register PCRE2_UCHAR op = *cc;
|
||||
|
||||
|
||||
switch (op)
|
||||
{
|
||||
case OP_COND:
|
||||
|
@ -133,26 +137,26 @@ for (;;)
|
|||
}
|
||||
goto PROCESS_NON_CAPTURE;
|
||||
|
||||
/* There's a special case of OP_ONCE, when it is wrapped round an
|
||||
OP_RECURSE. We'd like to process the latter at this level so that
|
||||
/* There's a special case of OP_ONCE, when it is wrapped round an
|
||||
OP_RECURSE. We'd like to process the latter at this level so that
|
||||
remembering the value works for repeated cases. So we do nothing, but
|
||||
set a fudge value to skip over the OP_KET after the recurse. */
|
||||
|
||||
|
||||
case OP_ONCE:
|
||||
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
|
||||
{
|
||||
once_fudge = 1 + LINK_SIZE;
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* Fall through */
|
||||
|
||||
|
||||
case OP_ONCE_NC:
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
PROCESS_NON_CAPTURE:
|
||||
PROCESS_NON_CAPTURE:
|
||||
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
|
@ -162,24 +166,25 @@ for (;;)
|
|||
|
||||
/* To save time for repeated capturing subpatterns, we remember the
|
||||
length of the previous one. Unfortunately we can't do the same for
|
||||
the unnumbered ones above. */
|
||||
the unnumbered ones above. Nor can we do this if (?| is present in the
|
||||
pattern because captures with the same number are not then identical. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
recno = GET2(cc, 1+LINK_SIZE);
|
||||
if (recno != prev_recno)
|
||||
recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
|
||||
if (recno != prev_cap_recno)
|
||||
{
|
||||
prev_recno = recno;
|
||||
prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||
if (prev_d < 0) return prev_d;
|
||||
prev_cap_recno = recno;
|
||||
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||
if (prev_cap_d < 0) return prev_cap_d;
|
||||
}
|
||||
branchlength += prev_d;
|
||||
branchlength += prev_cap_d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
|
||||
/* ACCEPT makes things far too complicated; we have to give up. */
|
||||
|
||||
case OP_ACCEPT:
|
||||
|
@ -427,8 +432,12 @@ for (;;)
|
|||
matches an empty string (by default it causes a matching failure), so in
|
||||
that case we must set the minimum length to zero. */
|
||||
|
||||
case OP_DNREF: /* Duplicate named pattern back reference */
|
||||
/* Duplicate named pattern back reference. We cannot reliably find a length
|
||||
for this if duplicate numbers are present in the pattern. */
|
||||
|
||||
case OP_DNREF:
|
||||
case OP_DNREFI:
|
||||
if (dupcapused) return -1;
|
||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||
{
|
||||
int count = GET2(cc, 1+IMM2_SIZE);
|
||||
|
@ -477,8 +486,12 @@ for (;;)
|
|||
cc += 1 + 2*IMM2_SIZE;
|
||||
goto REPEAT_BACK_REFERENCE;
|
||||
|
||||
case OP_REF: /* Single back reference */
|
||||
/* Single back reference. We cannot find a length for this if duplicate
|
||||
numbers are present in the pattern. */
|
||||
|
||||
case OP_REF:
|
||||
case OP_REFI:
|
||||
if (dupcapused) return -1;
|
||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||
{
|
||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
|
||||
|
@ -546,15 +559,19 @@ for (;;)
|
|||
branchlength += min * d;
|
||||
break;
|
||||
|
||||
/* Recursion always refers to the first occurrence of a subpattern with a
|
||||
given number. Therefore, we can always make use of caching, even when the
|
||||
pattern contains multiple subpatterns with the same number. */
|
||||
|
||||
case OP_RECURSE:
|
||||
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
||||
recno = GET2(cs, 1+LINK_SIZE);
|
||||
if (recno == prev_recno)
|
||||
recno = GET2(cs, 1+LINK_SIZE);
|
||||
if (recno == prev_recurse_recno)
|
||||
{
|
||||
branchlength += prev_d;
|
||||
}
|
||||
branchlength += prev_recurse_d;
|
||||
}
|
||||
else
|
||||
{
|
||||
{
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
had_recurse = TRUE;
|
||||
|
@ -568,16 +585,16 @@ for (;;)
|
|||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
|
||||
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
|
||||
countptr);
|
||||
if (prev_d < 0) return prev_d;
|
||||
prev_recno = recno;
|
||||
branchlength += prev_d;
|
||||
if (prev_recurse_d < 0) return prev_recurse_d;
|
||||
prev_recurse_recno = recno;
|
||||
branchlength += prev_recurse_d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cc += 1 + LINK_SIZE + once_fudge;
|
||||
once_fudge = 0;
|
||||
once_fudge = 0;
|
||||
break;
|
||||
|
||||
/* Anything else does not or need not match a character. We can get the
|
||||
|
|
|
@ -5727,4 +5727,26 @@ name)/mark
|
|||
"(?|(\k'Pm')|(?'Pm'))"
|
||||
abcd
|
||||
|
||||
/(?|(aaa)|(b))\g{1}/
|
||||
aaaaaa
|
||||
bb
|
||||
|
||||
/(?|(aaa)|(b))(?1)/
|
||||
aaaaaa
|
||||
baaa
|
||||
** Failers
|
||||
bb
|
||||
|
||||
/(?|(aaa)|(b))/
|
||||
xaaa
|
||||
xbc
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))\k'a'/
|
||||
aaaaaa
|
||||
bb
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
|
||||
aaaccccaaa
|
||||
bccccb
|
||||
|
||||
# End of testinput1
|
||||
|
|
|
@ -4362,4 +4362,14 @@ a random value. /Ix
|
|||
|
||||
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
|
||||
|
||||
/(?|(aaa)|(b))\g{1}/I
|
||||
|
||||
/(?|(aaa)|(b))(?1)/I
|
||||
|
||||
/(?|(aaa)|(b))/I
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))\k'a'/I
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -9463,4 +9463,50 @@ No match
|
|||
0:
|
||||
1:
|
||||
|
||||
/(?|(aaa)|(b))\g{1}/
|
||||
aaaaaa
|
||||
0: aaaaaa
|
||||
1: aaa
|
||||
bb
|
||||
0: bb
|
||||
1: b
|
||||
|
||||
/(?|(aaa)|(b))(?1)/
|
||||
aaaaaa
|
||||
0: aaaaaa
|
||||
1: aaa
|
||||
baaa
|
||||
0: baaa
|
||||
1: b
|
||||
** Failers
|
||||
No match
|
||||
bb
|
||||
No match
|
||||
|
||||
/(?|(aaa)|(b))/
|
||||
xaaa
|
||||
0: aaa
|
||||
1: aaa
|
||||
xbc
|
||||
0: b
|
||||
1: b
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))\k'a'/
|
||||
aaaaaa
|
||||
0: aaaaaa
|
||||
1: aaa
|
||||
bb
|
||||
0: bb
|
||||
1: b
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
|
||||
aaaccccaaa
|
||||
0: aaaccccaaa
|
||||
1: aaa
|
||||
2: cccc
|
||||
bccccb
|
||||
0: bccccb
|
||||
1: b
|
||||
2: cccc
|
||||
|
||||
# End of testinput1
|
||||
|
|
|
@ -14576,4 +14576,39 @@ Duplicate name status changes
|
|||
Last code unit = '0'
|
||||
Subject length lower bound = 65535
|
||||
|
||||
/(?|(aaa)|(b))\g{1}/I
|
||||
Capturing subpattern count = 1
|
||||
Max back reference = 1
|
||||
Starting code units: a b
|
||||
Subject length lower bound = 0
|
||||
|
||||
/(?|(aaa)|(b))(?1)/I
|
||||
Capturing subpattern count = 1
|
||||
Starting code units: a b
|
||||
Subject length lower bound = 4
|
||||
|
||||
/(?|(aaa)|(b))/I
|
||||
Capturing subpattern count = 1
|
||||
Starting code units: a b
|
||||
Subject length lower bound = 1
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))\k'a'/I
|
||||
Capturing subpattern count = 1
|
||||
Max back reference = 1
|
||||
Named capturing subpatterns:
|
||||
a 1
|
||||
Starting code units: a b
|
||||
Subject length lower bound = 0
|
||||
|
||||
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
|
||||
Capturing subpattern count = 2
|
||||
Max back reference = 2
|
||||
Named capturing subpatterns:
|
||||
a 1
|
||||
a 2
|
||||
Options: dupnames
|
||||
Starting code units: a b
|
||||
Last code unit = 'c'
|
||||
Subject length lower bound = 0
|
||||
|
||||
# End of testinput2
|
||||
|
|
Loading…
Reference in New Issue