Fix bugs in recent patch for setting the maximum lookbehind.

This commit is contained in:
Philip.Hazel 2019-06-28 16:58:08 +00:00
parent c0d0ee5365
commit 4866bd3652
3 changed files with 112 additions and 36 deletions

View File

@ -128,7 +128,7 @@ static int
compile_block *, PCRE2_SIZE *);
static int
get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
compile_block *);
static BOOL
@ -388,6 +388,9 @@ compiler is clever with identical subexpressions. */
#define GI_SET_FIXED_LENGTH 0x80000000u
#define GI_NOT_FIXED_LENGTH 0x40000000u
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
#define GI_EXTRA_MASK 0x0fff0000u
#define GI_EXTRA_MAX 0xfff /* NB not unsigned */
#define GI_EXTRA_SHIFT 16
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned
@ -8841,6 +8844,7 @@ improve processing speed when the same capturing group occurs many times.
Arguments:
pptrptr pointer to pointer in the parsed pattern
isinline FALSE if a reference or recursion; TRUE for inline group
extraptr pointer to where to return extra lookbehind length
errcodeptr pointer to the errorcode
lcptr pointer to the loop counter
group number of captured group or -1 for a non-capturing group
@ -8851,11 +8855,13 @@ Returns: the group length or a negative number
*/
static int
get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
int group, parsed_recurse_check *recurses, compile_block *cb)
get_grouplength(uint32_t **pptrptr, BOOL isinline, int *extraptr,
int *errcodeptr, int *lcptr, int group, parsed_recurse_check *recurses,
compile_block *cb)
{
int branchlength;
int grouplength = -1;
int extra = 0;
/* The cache can be used only if there is no possibility of there being two
groups with the same number. We do not need to set the end pointer for a group
@ -8869,6 +8875,7 @@ if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
{
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
*extraptr = (groupinfo & GI_EXTRA_MASK) >> GI_EXTRA_SHIFT;
return groupinfo & GI_FIXED_LENGTH_MASK;
}
}
@ -8877,16 +8884,28 @@ if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
for(;;)
{
branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
int branchextra;
branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
recurses, cb);
if (branchlength < 0) goto ISNOTFIXED;
if (grouplength == -1) grouplength = branchlength;
else if (grouplength != branchlength) goto ISNOTFIXED;
if (grouplength == -1)
{
grouplength = branchlength;
extra = branchextra;
}
else if (grouplength != branchlength || extra != branchextra) goto ISNOTFIXED;
if (**pptrptr == META_KET) break;
*pptrptr += 1; /* Skip META_ALT */
}
if (group > 0)
cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
/* There are only 12 bits for caching the extra value, but a pattern that
needs more than that is weird indeed. */
if (group > 0 && extra <= GI_EXTRA_MAX)
cb->groupinfo[group] |= (uint32_t)
(GI_SET_FIXED_LENGTH | (extra << GI_EXTRA_SHIFT) | grouplength);
*extraptr = extra;
return grouplength;
ISNOTFIXED:
@ -8901,13 +8920,17 @@ return -1;
*************************************************/
/* Return a fixed length for a branch in a lookbehind, giving an error if the
length is not fixed. If any lookbehinds are encountered on the way, they get
their length set, and there is a check for them looking further back than the
current lookbehind. On entry, *pptrptr points to the first element inside the
branch. On exit it is set to point to the ALT or KET.
length is not fixed. We also take note of any extra value that is generated
from a nested lookbehind. For example, for /(?<=a(?<=ba)c)/ each individual
lookbehind has length 2, but the max_lookbehind setting must be 3 because
matching inspects 3 characters before the match starting point.
On entry, *pptrptr points to the first element inside the branch. On exit it is
set to point to the ALT or KET.
Arguments:
pptrptr pointer to pointer in the parsed pattern
extraptr pointer to where to return extra lookbehind length
errcodeptr pointer to error code
lcptr pointer to loop counter
recurses chain of recurse_check to catch mutual recursion
@ -8917,11 +8940,12 @@ Returns: the length, or a negative value on error
*/
static int
get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
get_branchlength(uint32_t **pptrptr, int *extraptr, int *errcodeptr, int *lcptr,
parsed_recurse_check *recurses, compile_block *cb)
{
int branchlength = 0;
int grouplength;
int groupextra;
int max;
int extra = 0; /* Additional lookbehind from nesting */
uint32_t lastitemlength = 0;
@ -9070,11 +9094,11 @@ for (;; pptr++)
}
break;
/* A lookbehind does not contribute any length to this lookbehind, but must
itself be checked and have its lengths set. If the maximum lookebhind of
any branch is greater than the length so far computed for this branch, we
must set an extra value for use when setting the maximum overall
lookbehind. */
/* A nested lookbehind does not contribute any length to this lookbehind,
but must itself be checked and have its lengths set. If the maximum
lookbehind for the nested lookbehind is greater than the length so far
computed for this branch, we must compute an extra value and keep the
largest encountered for use when setting the maximum overall lookbehind. */
case META_LOOKBEHIND:
case META_LOOKBEHINDNOT:
@ -9188,15 +9212,14 @@ for (;; pptr++)
in the cache. */
gptr++;
grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
&this_recurse, cb);
grouplength = get_grouplength(&gptr, FALSE, &groupextra, errcodeptr, lcptr,
group, &this_recurse, cb);
if (grouplength < 0)
{
if (*errcodeptr == 0) goto ISNOTFIXED;
return -1; /* Error already set */
}
itemlength = grouplength;
break;
goto OK_GROUP;
/* Check nested groups - advance past the initial data for each type and
then seek a fixed length with get_grouplength(). */
@ -9226,10 +9249,16 @@ for (;; pptr++)
case META_SCRIPT_RUN:
pptr++;
CHECK_GROUP:
grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
recurses, cb);
grouplength = get_grouplength(&pptr, TRUE, &groupextra, errcodeptr, lcptr,
group, recurses, cb);
if (grouplength < 0) return -1;
/* A nested lookbehind within the group may require looking back further
than the length of the group. */
OK_GROUP:
itemlength = grouplength;
if (groupextra - branchlength > extra) extra = groupextra - branchlength;
break;
/* Exact repetition is OK; variable repetition is not. A repetition of zero
@ -9272,15 +9301,7 @@ for (;; pptr++)
EXIT:
*pptrptr = pptr;
/* The overall maximum lookbehind for any branch in the pattern takes note of
any extra value that is generated from a nested lookbehind. For example, for
/(?<=a(?<=ba)c)/ each individual lookbehind has length 2, but the
max_lookbehind setting is 3 because matching inspects 3 characters before the
match starting point. */
if (branchlength + extra > cb->max_lookbehind)
cb->max_lookbehind = branchlength + extra;
*extraptr = extra;
return branchlength;
PARSED_SKIP_FAILED:
@ -9299,9 +9320,14 @@ branches. An error occurs if any branch does not have a fixed length that is
less than the maximum (65535). On exit, the pointer must be left on the final
ket.
The function also maintains the max_lookbehind value. Any lookbehind branch
that contains a nested lookbehind may actually look further back than the
length of the branch. The additional amount is passed back from
get_branchlength() as an "extra" value.
Arguments:
pptrptr pointer to pointer in the parsed pattern
maxptr where to return maximum length for the whole group
maxptr where to return maximum lookbehind for the whole group
errcodeptr pointer to error code
lcptr pointer to loop counter
recurses chain of recurse_check to catch mutual recursion
@ -9317,6 +9343,7 @@ set_lookbehind_lengths(uint32_t **pptrptr, int *maxptr, int *errcodeptr,
{
PCRE2_SIZE offset;
int branchlength;
int branchextra;
int max = 0;
uint32_t *bptr = *pptrptr;
@ -9326,7 +9353,8 @@ READPLUSOFFSET(offset, bptr); /* Offset for error messages */
do
{
*pptrptr += 1;
branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
recurses, cb);
if (branchlength < 0)
{
/* The errorcode and offset may already be set from a nested lookbehind. */
@ -9334,12 +9362,13 @@ do
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
return FALSE;
}
if (branchlength > max) max = branchlength;
if (branchlength + branchextra > max) max = branchlength + branchextra;
*bptr |= branchlength; /* branchlength never more than 65535 */
bptr = *pptrptr;
}
while (*bptr == META_ALT);
if (max > cb->max_lookbehind) cb->max_lookbehind = max;
*maxptr = max;
return TRUE;
}

12
testdata/testinput15 vendored
View File

@ -157,6 +157,18 @@
/(?<=ab)cdef/
xxabcd\=ph
/(?<=(?<=(?<=a)b)c)./I
123abcXYZ
/(?<=ab(cd(?<=...)))./I
abcdX
/(?<=ab((?<=...)cd))./I
ZabcdX
/(?<=((?<=(?<=ab).))(?1)(?1))./I
abxZ
#subject
# -------------------------------------------------------------------

35
testdata/testoutput15 vendored
View File

@ -335,6 +335,41 @@ Partial match: abcd
Partial match: abcd
<<
/(?<=(?<=(?<=a)b)c)./I
Capture group count = 0
Max lookbehind = 3
Subject length lower bound = 1
123abcXYZ
0: abcX
<<<
/(?<=ab(cd(?<=...)))./I
Capture group count = 1
Max lookbehind = 4
Subject length lower bound = 1
abcdX
0: abcdX
<<<<
1: cd
/(?<=ab((?<=...)cd))./I
Capture group count = 1
Max lookbehind = 5
Subject length lower bound = 1
ZabcdX
0: ZabcdX
<<<<<
1: cd
/(?<=((?<=(?<=ab).))(?1)(?1))./I
Capture group count = 1
Max lookbehind = 3
Subject length lower bound = 1
abxZ
0: abxZ
<<<
1:
#subject
# -------------------------------------------------------------------