Fix incorrect minimum matching length when pattern contains (?| groups.
This commit is contained in:
parent
8269696f57
commit
3e12e15fe1
|
@ -92,6 +92,12 @@ gives up trying to find a minimum length when things get too complex.
|
||||||
24. An optimization has been added that speeds up finding the minimum matching
|
24. An optimization has been added that speeds up finding the minimum matching
|
||||||
length for patterns containing repeated capturing groups or recursions.
|
length for patterns containing repeated capturing groups or recursions.
|
||||||
|
|
||||||
|
25. If a pattern contained a back reference to a group whose number was
|
||||||
|
duplicated as a result of appearing in a (?|...) group, the computation of the
|
||||||
|
minimum matching length gave a wrong result, which could cause incorrect "no
|
||||||
|
match" errors. For such patterns, a minimum matching length cannot at present
|
||||||
|
be computed.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -3215,6 +3215,7 @@ for (; ptr < cb->end_pattern; ptr++)
|
||||||
top_nest->reset_group = cb->bracount;
|
top_nest->reset_group = cb->bracount;
|
||||||
top_nest->max_group = cb->bracount;
|
top_nest->max_group = cb->bracount;
|
||||||
top_nest->flags |= NSF_RESET;
|
top_nest->flags |= NSF_RESET;
|
||||||
|
cb->external_flags |= PCRE2_DUPCAPUSED;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -524,9 +524,10 @@ bytes in a code unit in that mode. */
|
||||||
#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */
|
#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */
|
||||||
#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */
|
#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */
|
||||||
#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */
|
#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */
|
||||||
#define PCRE2_DEREF_TABLES 0x00040000 /* Release character tables. */
|
#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */
|
||||||
#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */
|
#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */
|
||||||
#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */
|
#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */
|
||||||
|
#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */
|
||||||
|
|
||||||
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)
|
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)
|
||||||
|
|
||||||
|
|
|
@ -74,12 +74,13 @@ Arguments:
|
||||||
startcode pointer to start of the whole pattern's code
|
startcode pointer to start of the whole pattern's code
|
||||||
utf UTF flag
|
utf UTF flag
|
||||||
recurses chain of recurse_check to catch mutual recursion
|
recurses chain of recurse_check to catch mutual recursion
|
||||||
countptr pointer to call count (to catch over complexity)
|
countptr pointer to call count (to catch over complexity)
|
||||||
|
|
||||||
Returns: the minimum length
|
Returns: the minimum length
|
||||||
-1 \C in UTF-8 mode
|
-1 \C in UTF-8 mode
|
||||||
or (*ACCEPT)
|
or (*ACCEPT)
|
||||||
or pattern too complicated
|
or pattern too complicated
|
||||||
|
or back reference to duplicate name/number
|
||||||
-2 internal error (missing capturing bracket)
|
-2 internal error (missing capturing bracket)
|
||||||
-3 internal error (opcode not listed)
|
-3 internal error (opcode not listed)
|
||||||
*/
|
*/
|
||||||
|
@ -89,10 +90,13 @@ find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
|
||||||
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
|
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
|
||||||
{
|
{
|
||||||
int length = -1;
|
int length = -1;
|
||||||
int prev_recno = -1;
|
int prev_cap_recno = -1;
|
||||||
int prev_d = 0;
|
int prev_cap_d = 0;
|
||||||
|
int prev_recurse_recno = -1;
|
||||||
|
int prev_recurse_d = 0;
|
||||||
uint32_t once_fudge = 0;
|
uint32_t once_fudge = 0;
|
||||||
BOOL had_recurse = FALSE;
|
BOOL had_recurse = FALSE;
|
||||||
|
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
|
||||||
recurse_check this_recurse;
|
recurse_check this_recurse;
|
||||||
register int branchlength = 0;
|
register int branchlength = 0;
|
||||||
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
||||||
|
@ -114,7 +118,7 @@ for (;;)
|
||||||
int d, min, recno;
|
int d, min, recno;
|
||||||
PCRE2_UCHAR *cs, *ce;
|
PCRE2_UCHAR *cs, *ce;
|
||||||
register PCRE2_UCHAR op = *cc;
|
register PCRE2_UCHAR op = *cc;
|
||||||
|
|
||||||
switch (op)
|
switch (op)
|
||||||
{
|
{
|
||||||
case OP_COND:
|
case OP_COND:
|
||||||
|
@ -133,26 +137,26 @@ for (;;)
|
||||||
}
|
}
|
||||||
goto PROCESS_NON_CAPTURE;
|
goto PROCESS_NON_CAPTURE;
|
||||||
|
|
||||||
/* There's a special case of OP_ONCE, when it is wrapped round an
|
/* There's a special case of OP_ONCE, when it is wrapped round an
|
||||||
OP_RECURSE. We'd like to process the latter at this level so that
|
OP_RECURSE. We'd like to process the latter at this level so that
|
||||||
remembering the value works for repeated cases. So we do nothing, but
|
remembering the value works for repeated cases. So we do nothing, but
|
||||||
set a fudge value to skip over the OP_KET after the recurse. */
|
set a fudge value to skip over the OP_KET after the recurse. */
|
||||||
|
|
||||||
case OP_ONCE:
|
case OP_ONCE:
|
||||||
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
|
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
|
||||||
{
|
{
|
||||||
once_fudge = 1 + LINK_SIZE;
|
once_fudge = 1 + LINK_SIZE;
|
||||||
cc += 1 + LINK_SIZE;
|
cc += 1 + LINK_SIZE;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* Fall through */
|
/* Fall through */
|
||||||
|
|
||||||
case OP_ONCE_NC:
|
case OP_ONCE_NC:
|
||||||
case OP_BRA:
|
case OP_BRA:
|
||||||
case OP_SBRA:
|
case OP_SBRA:
|
||||||
case OP_BRAPOS:
|
case OP_BRAPOS:
|
||||||
case OP_SBRAPOS:
|
case OP_SBRAPOS:
|
||||||
PROCESS_NON_CAPTURE:
|
PROCESS_NON_CAPTURE:
|
||||||
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||||
if (d < 0) return d;
|
if (d < 0) return d;
|
||||||
branchlength += d;
|
branchlength += d;
|
||||||
|
@ -162,24 +166,25 @@ for (;;)
|
||||||
|
|
||||||
/* To save time for repeated capturing subpatterns, we remember the
|
/* To save time for repeated capturing subpatterns, we remember the
|
||||||
length of the previous one. Unfortunately we can't do the same for
|
length of the previous one. Unfortunately we can't do the same for
|
||||||
the unnumbered ones above. */
|
the unnumbered ones above. Nor can we do this if (?| is present in the
|
||||||
|
pattern because captures with the same number are not then identical. */
|
||||||
|
|
||||||
case OP_CBRA:
|
case OP_CBRA:
|
||||||
case OP_SCBRA:
|
case OP_SCBRA:
|
||||||
case OP_CBRAPOS:
|
case OP_CBRAPOS:
|
||||||
case OP_SCBRAPOS:
|
case OP_SCBRAPOS:
|
||||||
recno = GET2(cc, 1+LINK_SIZE);
|
recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
|
||||||
if (recno != prev_recno)
|
if (recno != prev_cap_recno)
|
||||||
{
|
{
|
||||||
prev_recno = recno;
|
prev_cap_recno = recno;
|
||||||
prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
|
||||||
if (prev_d < 0) return prev_d;
|
if (prev_cap_d < 0) return prev_cap_d;
|
||||||
}
|
}
|
||||||
branchlength += prev_d;
|
branchlength += prev_cap_d;
|
||||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||||
cc += 1 + LINK_SIZE;
|
cc += 1 + LINK_SIZE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* ACCEPT makes things far too complicated; we have to give up. */
|
/* ACCEPT makes things far too complicated; we have to give up. */
|
||||||
|
|
||||||
case OP_ACCEPT:
|
case OP_ACCEPT:
|
||||||
|
@ -427,8 +432,12 @@ for (;;)
|
||||||
matches an empty string (by default it causes a matching failure), so in
|
matches an empty string (by default it causes a matching failure), so in
|
||||||
that case we must set the minimum length to zero. */
|
that case we must set the minimum length to zero. */
|
||||||
|
|
||||||
case OP_DNREF: /* Duplicate named pattern back reference */
|
/* Duplicate named pattern back reference. We cannot reliably find a length
|
||||||
|
for this if duplicate numbers are present in the pattern. */
|
||||||
|
|
||||||
|
case OP_DNREF:
|
||||||
case OP_DNREFI:
|
case OP_DNREFI:
|
||||||
|
if (dupcapused) return -1;
|
||||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||||
{
|
{
|
||||||
int count = GET2(cc, 1+IMM2_SIZE);
|
int count = GET2(cc, 1+IMM2_SIZE);
|
||||||
|
@ -477,8 +486,12 @@ for (;;)
|
||||||
cc += 1 + 2*IMM2_SIZE;
|
cc += 1 + 2*IMM2_SIZE;
|
||||||
goto REPEAT_BACK_REFERENCE;
|
goto REPEAT_BACK_REFERENCE;
|
||||||
|
|
||||||
case OP_REF: /* Single back reference */
|
/* Single back reference. We cannot find a length for this if duplicate
|
||||||
|
numbers are present in the pattern. */
|
||||||
|
|
||||||
|
case OP_REF:
|
||||||
case OP_REFI:
|
case OP_REFI:
|
||||||
|
if (dupcapused) return -1;
|
||||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||||
{
|
{
|
||||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
|
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
|
||||||
|
@ -546,15 +559,19 @@ for (;;)
|
||||||
branchlength += min * d;
|
branchlength += min * d;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/* Recursion always refers to the first occurrence of a subpattern with a
|
||||||
|
given number. Therefore, we can always make use of caching, even when the
|
||||||
|
pattern contains multiple subpatterns with the same number. */
|
||||||
|
|
||||||
case OP_RECURSE:
|
case OP_RECURSE:
|
||||||
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
||||||
recno = GET2(cs, 1+LINK_SIZE);
|
recno = GET2(cs, 1+LINK_SIZE);
|
||||||
if (recno == prev_recno)
|
if (recno == prev_recurse_recno)
|
||||||
{
|
{
|
||||||
branchlength += prev_d;
|
branchlength += prev_recurse_d;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||||
if (cc > cs && cc < ce) /* Simple recursion */
|
if (cc > cs && cc < ce) /* Simple recursion */
|
||||||
had_recurse = TRUE;
|
had_recurse = TRUE;
|
||||||
|
@ -568,16 +585,16 @@ for (;;)
|
||||||
{
|
{
|
||||||
this_recurse.prev = recurses;
|
this_recurse.prev = recurses;
|
||||||
this_recurse.group = cs;
|
this_recurse.group = cs;
|
||||||
prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
|
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
|
||||||
countptr);
|
countptr);
|
||||||
if (prev_d < 0) return prev_d;
|
if (prev_recurse_d < 0) return prev_recurse_d;
|
||||||
prev_recno = recno;
|
prev_recurse_recno = recno;
|
||||||
branchlength += prev_d;
|
branchlength += prev_recurse_d;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cc += 1 + LINK_SIZE + once_fudge;
|
cc += 1 + LINK_SIZE + once_fudge;
|
||||||
once_fudge = 0;
|
once_fudge = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Anything else does not or need not match a character. We can get the
|
/* Anything else does not or need not match a character. We can get the
|
||||||
|
|
|
@ -5727,4 +5727,26 @@ name)/mark
|
||||||
"(?|(\k'Pm')|(?'Pm'))"
|
"(?|(\k'Pm')|(?'Pm'))"
|
||||||
abcd
|
abcd
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))\g{1}/
|
||||||
|
aaaaaa
|
||||||
|
bb
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))(?1)/
|
||||||
|
aaaaaa
|
||||||
|
baaa
|
||||||
|
** Failers
|
||||||
|
bb
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))/
|
||||||
|
xaaa
|
||||||
|
xbc
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))\k'a'/
|
||||||
|
aaaaaa
|
||||||
|
bb
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
|
||||||
|
aaaccccaaa
|
||||||
|
bccccb
|
||||||
|
|
||||||
# End of testinput1
|
# End of testinput1
|
||||||
|
|
|
@ -4362,4 +4362,14 @@ a random value. /Ix
|
||||||
|
|
||||||
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
|
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))\g{1}/I
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))(?1)/I
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))/I
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))\k'a'/I
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -9463,4 +9463,50 @@ No match
|
||||||
0:
|
0:
|
||||||
1:
|
1:
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))\g{1}/
|
||||||
|
aaaaaa
|
||||||
|
0: aaaaaa
|
||||||
|
1: aaa
|
||||||
|
bb
|
||||||
|
0: bb
|
||||||
|
1: b
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))(?1)/
|
||||||
|
aaaaaa
|
||||||
|
0: aaaaaa
|
||||||
|
1: aaa
|
||||||
|
baaa
|
||||||
|
0: baaa
|
||||||
|
1: b
|
||||||
|
** Failers
|
||||||
|
No match
|
||||||
|
bb
|
||||||
|
No match
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))/
|
||||||
|
xaaa
|
||||||
|
0: aaa
|
||||||
|
1: aaa
|
||||||
|
xbc
|
||||||
|
0: b
|
||||||
|
1: b
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))\k'a'/
|
||||||
|
aaaaaa
|
||||||
|
0: aaaaaa
|
||||||
|
1: aaa
|
||||||
|
bb
|
||||||
|
0: bb
|
||||||
|
1: b
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
|
||||||
|
aaaccccaaa
|
||||||
|
0: aaaccccaaa
|
||||||
|
1: aaa
|
||||||
|
2: cccc
|
||||||
|
bccccb
|
||||||
|
0: bccccb
|
||||||
|
1: b
|
||||||
|
2: cccc
|
||||||
|
|
||||||
# End of testinput1
|
# End of testinput1
|
||||||
|
|
|
@ -14576,4 +14576,39 @@ Duplicate name status changes
|
||||||
Last code unit = '0'
|
Last code unit = '0'
|
||||||
Subject length lower bound = 65535
|
Subject length lower bound = 65535
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))\g{1}/I
|
||||||
|
Capturing subpattern count = 1
|
||||||
|
Max back reference = 1
|
||||||
|
Starting code units: a b
|
||||||
|
Subject length lower bound = 0
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))(?1)/I
|
||||||
|
Capturing subpattern count = 1
|
||||||
|
Starting code units: a b
|
||||||
|
Subject length lower bound = 4
|
||||||
|
|
||||||
|
/(?|(aaa)|(b))/I
|
||||||
|
Capturing subpattern count = 1
|
||||||
|
Starting code units: a b
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))\k'a'/I
|
||||||
|
Capturing subpattern count = 1
|
||||||
|
Max back reference = 1
|
||||||
|
Named capturing subpatterns:
|
||||||
|
a 1
|
||||||
|
Starting code units: a b
|
||||||
|
Subject length lower bound = 0
|
||||||
|
|
||||||
|
/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
|
||||||
|
Capturing subpattern count = 2
|
||||||
|
Max back reference = 2
|
||||||
|
Named capturing subpatterns:
|
||||||
|
a 1
|
||||||
|
a 2
|
||||||
|
Options: dupnames
|
||||||
|
Starting code units: a b
|
||||||
|
Last code unit = 'c'
|
||||||
|
Subject length lower bound = 0
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
Loading…
Reference in New Issue