Fix bugs in recent patch for setting the maximum lookbehind.
This commit is contained in:
parent
c0d0ee5365
commit
4866bd3652
|
@@ -128,7 +128,7 @@ static int
|
||||||
compile_block *, PCRE2_SIZE *);
|
compile_block *, PCRE2_SIZE *);
|
||||||
|
|
||||||
static int
|
static int
|
||||||
get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
|
get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
|
||||||
compile_block *);
|
compile_block *);
|
||||||
|
|
||||||
static BOOL
|
static BOOL
|
||||||
|
@@ -388,6 +388,9 @@ compiler is clever with identical subexpressions. */
|
||||||
#define GI_SET_FIXED_LENGTH 0x80000000u
|
#define GI_SET_FIXED_LENGTH 0x80000000u
|
||||||
#define GI_NOT_FIXED_LENGTH 0x40000000u
|
#define GI_NOT_FIXED_LENGTH 0x40000000u
|
||||||
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
|
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
|
||||||
|
#define GI_EXTRA_MASK 0x0fff0000u
|
||||||
|
#define GI_EXTRA_MAX 0xfff /* NB not unsigned */
|
||||||
|
#define GI_EXTRA_SHIFT 16
|
||||||
|
|
||||||
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
|
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
|
||||||
and is fast (a good compiler can turn it into a subtraction and unsigned
|
and is fast (a good compiler can turn it into a subtraction and unsigned
|
||||||
|
@@ -8841,6 +8844,7 @@ improve processing speed when the same capturing group occurs many times.
|
||||||
Arguments:
|
Arguments:
|
||||||
pptrptr pointer to pointer in the parsed pattern
|
pptrptr pointer to pointer in the parsed pattern
|
||||||
isinline FALSE if a reference or recursion; TRUE for inline group
|
isinline FALSE if a reference or recursion; TRUE for inline group
|
||||||
|
extraptr pointer to where to return extra lookbehind length
|
||||||
errcodeptr pointer to the errorcode
|
errcodeptr pointer to the errorcode
|
||||||
lcptr pointer to the loop counter
|
lcptr pointer to the loop counter
|
||||||
group number of captured group or -1 for a non-capturing group
|
group number of captured group or -1 for a non-capturing group
|
||||||
|
@@ -8851,11 +8855,13 @@ Returns: the group length or a negative number
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
|
get_grouplength(uint32_t **pptrptr, BOOL isinline, int *extraptr,
|
||||||
int group, parsed_recurse_check *recurses, compile_block *cb)
|
int *errcodeptr, int *lcptr, int group, parsed_recurse_check *recurses,
|
||||||
|
compile_block *cb)
|
||||||
{
|
{
|
||||||
int branchlength;
|
int branchlength;
|
||||||
int grouplength = -1;
|
int grouplength = -1;
|
||||||
|
int extra = 0;
|
||||||
|
|
||||||
/* The cache can be used only if there is no possibility of there being two
|
/* The cache can be used only if there is no possibility of there being two
|
||||||
groups with the same number. We do not need to set the end pointer for a group
|
groups with the same number. We do not need to set the end pointer for a group
|
||||||
|
@@ -8869,6 +8875,7 @@ if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
|
||||||
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
|
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
|
||||||
{
|
{
|
||||||
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
|
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
|
||||||
|
*extraptr = (groupinfo & GI_EXTRA_MASK) >> GI_EXTRA_SHIFT;
|
||||||
return groupinfo & GI_FIXED_LENGTH_MASK;
|
return groupinfo & GI_FIXED_LENGTH_MASK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -8877,16 +8884,28 @@ if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
|
||||||
|
|
||||||
for(;;)
|
for(;;)
|
||||||
{
|
{
|
||||||
branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
|
int branchextra;
|
||||||
|
branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
|
||||||
|
recurses, cb);
|
||||||
if (branchlength < 0) goto ISNOTFIXED;
|
if (branchlength < 0) goto ISNOTFIXED;
|
||||||
if (grouplength == -1) grouplength = branchlength;
|
if (grouplength == -1)
|
||||||
else if (grouplength != branchlength) goto ISNOTFIXED;
|
{
|
||||||
|
grouplength = branchlength;
|
||||||
|
extra = branchextra;
|
||||||
|
}
|
||||||
|
else if (grouplength != branchlength || extra != branchextra) goto ISNOTFIXED;
|
||||||
if (**pptrptr == META_KET) break;
|
if (**pptrptr == META_KET) break;
|
||||||
*pptrptr += 1; /* Skip META_ALT */
|
*pptrptr += 1; /* Skip META_ALT */
|
||||||
}
|
}
|
||||||
|
|
||||||
if (group > 0)
|
/* There are only 12 bits for caching the extra value, but a pattern that
|
||||||
cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
|
needs more than that is weird indeed. */
|
||||||
|
|
||||||
|
if (group > 0 && extra <= GI_EXTRA_MAX)
|
||||||
|
cb->groupinfo[group] |= (uint32_t)
|
||||||
|
(GI_SET_FIXED_LENGTH | (extra << GI_EXTRA_SHIFT) | grouplength);
|
||||||
|
|
||||||
|
*extraptr = extra;
|
||||||
return grouplength;
|
return grouplength;
|
||||||
|
|
||||||
ISNOTFIXED:
|
ISNOTFIXED:
|
||||||
|
@@ -8901,13 +8920,17 @@ return -1;
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* Return a fixed length for a branch in a lookbehind, giving an error if the
|
/* Return a fixed length for a branch in a lookbehind, giving an error if the
|
||||||
length is not fixed. If any lookbehinds are encountered on the way, they get
|
length is not fixed. We also take note of any extra value that is generated
|
||||||
their length set, and there is a check for them looking further back than the
|
from a nested lookbehind. For example, for /(?<=a(?<=ba)c)/ each individual
|
||||||
current lookbehind. On entry, *pptrptr points to the first element inside the
|
lookbehind has length 2, but the max_lookbehind setting must be 3 because
|
||||||
branch. On exit it is set to point to the ALT or KET.
|
matching inspects 3 characters before the match starting point.
|
||||||
|
|
||||||
|
On entry, *pptrptr points to the first element inside the branch. On exit it is
|
||||||
|
set to point to the ALT or KET.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
pptrptr pointer to pointer in the parsed pattern
|
pptrptr pointer to pointer in the parsed pattern
|
||||||
|
extraptr pointer to where to return extra lookbehind length
|
||||||
errcodeptr pointer to error code
|
errcodeptr pointer to error code
|
||||||
lcptr pointer to loop counter
|
lcptr pointer to loop counter
|
||||||
recurses chain of recurse_check to catch mutual recursion
|
recurses chain of recurse_check to catch mutual recursion
|
||||||
|
@@ -8917,11 +8940,12 @@ Returns: the length, or a negative value on error
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
|
get_branchlength(uint32_t **pptrptr, int *extraptr, int *errcodeptr, int *lcptr,
|
||||||
parsed_recurse_check *recurses, compile_block *cb)
|
parsed_recurse_check *recurses, compile_block *cb)
|
||||||
{
|
{
|
||||||
int branchlength = 0;
|
int branchlength = 0;
|
||||||
int grouplength;
|
int grouplength;
|
||||||
|
int groupextra;
|
||||||
int max;
|
int max;
|
||||||
int extra = 0; /* Additional lookbehind from nesting */
|
int extra = 0; /* Additional lookbehind from nesting */
|
||||||
uint32_t lastitemlength = 0;
|
uint32_t lastitemlength = 0;
|
||||||
|
@@ -9070,11 +9094,11 @@ for (;; pptr++)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* A lookbehind does not contribute any length to this lookbehind, but must
|
/* A nested lookbehind does not contribute any length to this lookbehind,
|
||||||
itself be checked and have its lengths set. If the maximum lookebhind of
|
but must itself be checked and have its lengths set. If the maximum
|
||||||
any branch is greater than the length so far computed for this branch, we
|
lookbehind for the nested lookbehind is greater than the length so far
|
||||||
must set an extra value for use when setting the maximum overall
|
computed for this branch, we must compute an extra value and keep the
|
||||||
lookbehind. */
|
largest encountered for use when setting the maximum overall lookbehind. */
|
||||||
|
|
||||||
case META_LOOKBEHIND:
|
case META_LOOKBEHIND:
|
||||||
case META_LOOKBEHINDNOT:
|
case META_LOOKBEHINDNOT:
|
||||||
|
@@ -9188,15 +9212,14 @@ for (;; pptr++)
|
||||||
in the cache. */
|
in the cache. */
|
||||||
|
|
||||||
gptr++;
|
gptr++;
|
||||||
grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
|
grouplength = get_grouplength(&gptr, FALSE, &groupextra, errcodeptr, lcptr,
|
||||||
&this_recurse, cb);
|
group, &this_recurse, cb);
|
||||||
if (grouplength < 0)
|
if (grouplength < 0)
|
||||||
{
|
{
|
||||||
if (*errcodeptr == 0) goto ISNOTFIXED;
|
if (*errcodeptr == 0) goto ISNOTFIXED;
|
||||||
return -1; /* Error already set */
|
return -1; /* Error already set */
|
||||||
}
|
}
|
||||||
itemlength = grouplength;
|
goto OK_GROUP;
|
||||||
break;
|
|
||||||
|
|
||||||
/* Check nested groups - advance past the initial data for each type and
|
/* Check nested groups - advance past the initial data for each type and
|
||||||
then seek a fixed length with get_grouplength(). */
|
then seek a fixed length with get_grouplength(). */
|
||||||
|
@@ -9226,10 +9249,16 @@ for (;; pptr++)
|
||||||
case META_SCRIPT_RUN:
|
case META_SCRIPT_RUN:
|
||||||
pptr++;
|
pptr++;
|
||||||
CHECK_GROUP:
|
CHECK_GROUP:
|
||||||
grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
|
grouplength = get_grouplength(&pptr, TRUE, &groupextra, errcodeptr, lcptr,
|
||||||
recurses, cb);
|
group, recurses, cb);
|
||||||
if (grouplength < 0) return -1;
|
if (grouplength < 0) return -1;
|
||||||
|
|
||||||
|
/* A nested lookbehind within the group may require looking back further
|
||||||
|
than the length of the group. */
|
||||||
|
|
||||||
|
OK_GROUP:
|
||||||
itemlength = grouplength;
|
itemlength = grouplength;
|
||||||
|
if (groupextra - branchlength > extra) extra = groupextra - branchlength;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Exact repetition is OK; variable repetition is not. A repetition of zero
|
/* Exact repetition is OK; variable repetition is not. A repetition of zero
|
||||||
|
@@ -9272,15 +9301,7 @@ for (;; pptr++)
|
||||||
|
|
||||||
EXIT:
|
EXIT:
|
||||||
*pptrptr = pptr;
|
*pptrptr = pptr;
|
||||||
|
*extraptr = extra;
|
||||||
/* The overall maximum lookbehind for any branch in the pattern takes note of
|
|
||||||
any extra value that is generated from a nested lookbehind. For example, for
|
|
||||||
/(?<=a(?<=ba)c)/ each individual lookbehind has length 2, but the
|
|
||||||
max_lookbehind setting is 3 because matching inspects 3 characters before the
|
|
||||||
match starting point. */
|
|
||||||
|
|
||||||
if (branchlength + extra > cb->max_lookbehind)
|
|
||||||
cb->max_lookbehind = branchlength + extra;
|
|
||||||
return branchlength;
|
return branchlength;
|
||||||
|
|
||||||
PARSED_SKIP_FAILED:
|
PARSED_SKIP_FAILED:
|
||||||
|
@@ -9299,9 +9320,14 @@ branches. An error occurs if any branch does not have a fixed length that is
|
||||||
less than the maximum (65535). On exit, the pointer must be left on the final
|
less than the maximum (65535). On exit, the pointer must be left on the final
|
||||||
ket.
|
ket.
|
||||||
|
|
||||||
|
The function also maintains the max_lookbehind value. Any lookbehind branch
|
||||||
|
that contains a nested lookbehind may actually look further back than the
|
||||||
|
length of the branch. The additional amount is passed back from
|
||||||
|
get_branchlength() as an "extra" value.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
pptrptr pointer to pointer in the parsed pattern
|
pptrptr pointer to pointer in the parsed pattern
|
||||||
maxptr where to return maximum length for the whole group
|
maxptr where to return maximum lookbehind for the whole group
|
||||||
errcodeptr pointer to error code
|
errcodeptr pointer to error code
|
||||||
lcptr pointer to loop counter
|
lcptr pointer to loop counter
|
||||||
recurses chain of recurse_check to catch mutual recursion
|
recurses chain of recurse_check to catch mutual recursion
|
||||||
|
@@ -9317,6 +9343,7 @@ set_lookbehind_lengths(uint32_t **pptrptr, int *maxptr, int *errcodeptr,
|
||||||
{
|
{
|
||||||
PCRE2_SIZE offset;
|
PCRE2_SIZE offset;
|
||||||
int branchlength;
|
int branchlength;
|
||||||
|
int branchextra;
|
||||||
int max = 0;
|
int max = 0;
|
||||||
uint32_t *bptr = *pptrptr;
|
uint32_t *bptr = *pptrptr;
|
||||||
|
|
||||||
|
@@ -9326,7 +9353,8 @@ READPLUSOFFSET(offset, bptr); /* Offset for error messages */
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
*pptrptr += 1;
|
*pptrptr += 1;
|
||||||
branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
|
branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
|
||||||
|
recurses, cb);
|
||||||
if (branchlength < 0)
|
if (branchlength < 0)
|
||||||
{
|
{
|
||||||
/* The errorcode and offset may already be set from a nested lookbehind. */
|
/* The errorcode and offset may already be set from a nested lookbehind. */
|
||||||
|
@@ -9334,12 +9362,13 @@ do
|
||||||
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
|
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
if (branchlength > max) max = branchlength;
|
if (branchlength + branchextra > max) max = branchlength + branchextra;
|
||||||
*bptr |= branchlength; /* branchlength never more than 65535 */
|
*bptr |= branchlength; /* branchlength never more than 65535 */
|
||||||
bptr = *pptrptr;
|
bptr = *pptrptr;
|
||||||
}
|
}
|
||||||
while (*bptr == META_ALT);
|
while (*bptr == META_ALT);
|
||||||
|
|
||||||
|
if (max > cb->max_lookbehind) cb->max_lookbehind = max;
|
||||||
*maxptr = max;
|
*maxptr = max;
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
|
@@ -157,6 +157,18 @@
|
||||||
/(?<=ab)cdef/
|
/(?<=ab)cdef/
|
||||||
xxabcd\=ph
|
xxabcd\=ph
|
||||||
|
|
||||||
|
/(?<=(?<=(?<=a)b)c)./I
|
||||||
|
123abcXYZ
|
||||||
|
|
||||||
|
/(?<=ab(cd(?<=...)))./I
|
||||||
|
abcdX
|
||||||
|
|
||||||
|
/(?<=ab((?<=...)cd))./I
|
||||||
|
ZabcdX
|
||||||
|
|
||||||
|
/(?<=((?<=(?<=ab).))(?1)(?1))./I
|
||||||
|
abxZ
|
||||||
|
|
||||||
#subject
|
#subject
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@@ -335,6 +335,41 @@ Partial match: abcd
|
||||||
Partial match: abcd
|
Partial match: abcd
|
||||||
<<
|
<<
|
||||||
|
|
||||||
|
/(?<=(?<=(?<=a)b)c)./I
|
||||||
|
Capture group count = 0
|
||||||
|
Max lookbehind = 3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
123abcXYZ
|
||||||
|
0: abcX
|
||||||
|
<<<
|
||||||
|
|
||||||
|
/(?<=ab(cd(?<=...)))./I
|
||||||
|
Capture group count = 1
|
||||||
|
Max lookbehind = 4
|
||||||
|
Subject length lower bound = 1
|
||||||
|
abcdX
|
||||||
|
0: abcdX
|
||||||
|
<<<<
|
||||||
|
1: cd
|
||||||
|
|
||||||
|
/(?<=ab((?<=...)cd))./I
|
||||||
|
Capture group count = 1
|
||||||
|
Max lookbehind = 5
|
||||||
|
Subject length lower bound = 1
|
||||||
|
ZabcdX
|
||||||
|
0: ZabcdX
|
||||||
|
<<<<<
|
||||||
|
1: cd
|
||||||
|
|
||||||
|
/(?<=((?<=(?<=ab).))(?1)(?1))./I
|
||||||
|
Capture group count = 1
|
||||||
|
Max lookbehind = 3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
abxZ
|
||||||
|
0: abxZ
|
||||||
|
<<<
|
||||||
|
1:
|
||||||
|
|
||||||
#subject
|
#subject
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue