Fix bugs in recent patch for setting the maximum lookbehind.

This commit is contained in:
Philip.Hazel 2019-06-28 16:58:08 +00:00
parent c0d0ee5365
commit 4866bd3652
3 changed files with 112 additions and 36 deletions

View File

@ -128,7 +128,7 @@ static int
compile_block *, PCRE2_SIZE *); compile_block *, PCRE2_SIZE *);
static int static int
get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *, get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
compile_block *); compile_block *);
static BOOL static BOOL
@ -388,6 +388,9 @@ compiler is clever with identical subexpressions. */
#define GI_SET_FIXED_LENGTH 0x80000000u #define GI_SET_FIXED_LENGTH 0x80000000u
#define GI_NOT_FIXED_LENGTH 0x40000000u #define GI_NOT_FIXED_LENGTH 0x40000000u
#define GI_FIXED_LENGTH_MASK 0x0000ffffu #define GI_FIXED_LENGTH_MASK 0x0000ffffu
#define GI_EXTRA_MASK 0x0fff0000u
#define GI_EXTRA_MAX 0xfff /* NB not unsigned */
#define GI_EXTRA_SHIFT 16
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned and is fast (a good compiler can turn it into a subtraction and unsigned
@ -8841,6 +8844,7 @@ improve processing speed when the same capturing group occurs many times.
Arguments: Arguments:
pptrptr pointer to pointer in the parsed pattern pptrptr pointer to pointer in the parsed pattern
isinline FALSE if a reference or recursion; TRUE for inline group isinline FALSE if a reference or recursion; TRUE for inline group
extraptr pointer to where to return extra lookbehind length
errcodeptr pointer to the errorcode errcodeptr pointer to the errorcode
lcptr pointer to the loop counter lcptr pointer to the loop counter
group number of captured group or -1 for a non-capturing group group number of captured group or -1 for a non-capturing group
@ -8851,11 +8855,13 @@ Returns: the group length or a negative number
*/ */
static int static int
get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr, get_grouplength(uint32_t **pptrptr, BOOL isinline, int *extraptr,
int group, parsed_recurse_check *recurses, compile_block *cb) int *errcodeptr, int *lcptr, int group, parsed_recurse_check *recurses,
compile_block *cb)
{ {
int branchlength; int branchlength;
int grouplength = -1; int grouplength = -1;
int extra = 0;
/* The cache can be used only if there is no possibility of there being two /* The cache can be used only if there is no possibility of there being two
groups with the same number. We do not need to set the end pointer for a group groups with the same number. We do not need to set the end pointer for a group
@ -8869,6 +8875,7 @@ if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
{ {
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET); if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
*extraptr = (groupinfo & GI_EXTRA_MASK) >> GI_EXTRA_SHIFT;
return groupinfo & GI_FIXED_LENGTH_MASK; return groupinfo & GI_FIXED_LENGTH_MASK;
} }
} }
@ -8877,16 +8884,28 @@ if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
for(;;) for(;;)
{ {
branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); int branchextra;
branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
recurses, cb);
if (branchlength < 0) goto ISNOTFIXED; if (branchlength < 0) goto ISNOTFIXED;
if (grouplength == -1) grouplength = branchlength; if (grouplength == -1)
else if (grouplength != branchlength) goto ISNOTFIXED; {
grouplength = branchlength;
extra = branchextra;
}
else if (grouplength != branchlength || extra != branchextra) goto ISNOTFIXED;
if (**pptrptr == META_KET) break; if (**pptrptr == META_KET) break;
*pptrptr += 1; /* Skip META_ALT */ *pptrptr += 1; /* Skip META_ALT */
} }
if (group > 0) /* There are only 12 bits for caching the extra value, but a pattern that
cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength); needs more than that is weird indeed. */
if (group > 0 && extra <= GI_EXTRA_MAX)
cb->groupinfo[group] |= (uint32_t)
(GI_SET_FIXED_LENGTH | (extra << GI_EXTRA_SHIFT) | grouplength);
*extraptr = extra;
return grouplength; return grouplength;
ISNOTFIXED: ISNOTFIXED:
@ -8901,13 +8920,17 @@ return -1;
*************************************************/ *************************************************/
/* Return a fixed length for a branch in a lookbehind, giving an error if the /* Return a fixed length for a branch in a lookbehind, giving an error if the
length is not fixed. If any lookbehinds are encountered on the way, they get length is not fixed. We also take note of any extra value that is generated
their length set, and there is a check for them looking further back than the from a nested lookbehind. For example, for /(?<=a(?<=ba)c)/ each individual
current lookbehind. On entry, *pptrptr points to the first element inside the lookbehind has length 2, but the max_lookbehind setting must be 3 because
branch. On exit it is set to point to the ALT or KET. matching inspects 3 characters before the match starting point.
On entry, *pptrptr points to the first element inside the branch. On exit it is
set to point to the ALT or KET.
Arguments: Arguments:
pptrptr pointer to pointer in the parsed pattern pptrptr pointer to pointer in the parsed pattern
extraptr pointer to where to return extra lookbehind length
errcodeptr pointer to error code errcodeptr pointer to error code
lcptr pointer to loop counter lcptr pointer to loop counter
recurses chain of recurse_check to catch mutual recursion recurses chain of recurse_check to catch mutual recursion
@ -8917,11 +8940,12 @@ Returns: the length, or a negative value on error
*/ */
static int static int
get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, get_branchlength(uint32_t **pptrptr, int *extraptr, int *errcodeptr, int *lcptr,
parsed_recurse_check *recurses, compile_block *cb) parsed_recurse_check *recurses, compile_block *cb)
{ {
int branchlength = 0; int branchlength = 0;
int grouplength; int grouplength;
int groupextra;
int max; int max;
int extra = 0; /* Additional lookbehind from nesting */ int extra = 0; /* Additional lookbehind from nesting */
uint32_t lastitemlength = 0; uint32_t lastitemlength = 0;
@ -9070,11 +9094,11 @@ for (;; pptr++)
} }
break; break;
/* A lookbehind does not contribute any length to this lookbehind, but must /* A nested lookbehind does not contribute any length to this lookbehind,
itself be checked and have its lengths set. If the maximum lookebhind of but must itself be checked and have its lengths set. If the maximum
any branch is greater than the length so far computed for this branch, we lookbehind for the nested lookbehind is greater than the length so far
must set an extra value for use when setting the maximum overall computed for this branch, we must compute an extra value and keep the
lookbehind. */ largest encountered for use when setting the maximum overall lookbehind. */
case META_LOOKBEHIND: case META_LOOKBEHIND:
case META_LOOKBEHINDNOT: case META_LOOKBEHINDNOT:
@ -9188,15 +9212,14 @@ for (;; pptr++)
in the cache. */ in the cache. */
gptr++; gptr++;
grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group, grouplength = get_grouplength(&gptr, FALSE, &groupextra, errcodeptr, lcptr,
&this_recurse, cb); group, &this_recurse, cb);
if (grouplength < 0) if (grouplength < 0)
{ {
if (*errcodeptr == 0) goto ISNOTFIXED; if (*errcodeptr == 0) goto ISNOTFIXED;
return -1; /* Error already set */ return -1; /* Error already set */
} }
itemlength = grouplength; goto OK_GROUP;
break;
/* Check nested groups - advance past the initial data for each type and /* Check nested groups - advance past the initial data for each type and
then seek a fixed length with get_grouplength(). */ then seek a fixed length with get_grouplength(). */
@ -9226,10 +9249,16 @@ for (;; pptr++)
case META_SCRIPT_RUN: case META_SCRIPT_RUN:
pptr++; pptr++;
CHECK_GROUP: CHECK_GROUP:
grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group, grouplength = get_grouplength(&pptr, TRUE, &groupextra, errcodeptr, lcptr,
recurses, cb); group, recurses, cb);
if (grouplength < 0) return -1; if (grouplength < 0) return -1;
/* A nested lookbehind within the group may require looking back further
than the length of the group. */
OK_GROUP:
itemlength = grouplength; itemlength = grouplength;
if (groupextra - branchlength > extra) extra = groupextra - branchlength;
break; break;
/* Exact repetition is OK; variable repetition is not. A repetition of zero /* Exact repetition is OK; variable repetition is not. A repetition of zero
@ -9272,15 +9301,7 @@ for (;; pptr++)
EXIT: EXIT:
*pptrptr = pptr; *pptrptr = pptr;
*extraptr = extra;
/* The overall maximum lookbehind for any branch in the pattern takes note of
any extra value that is generated from a nested lookbehind. For example, for
/(?<=a(?<=ba)c)/ each individual lookbehind has length 2, but the
max_lookbehind setting is 3 because matching inspects 3 characters before the
match starting point. */
if (branchlength + extra > cb->max_lookbehind)
cb->max_lookbehind = branchlength + extra;
return branchlength; return branchlength;
PARSED_SKIP_FAILED: PARSED_SKIP_FAILED:
@ -9299,9 +9320,14 @@ branches. An error occurs if any branch does not have a fixed length that is
less than the maximum (65535). On exit, the pointer must be left on the final less than the maximum (65535). On exit, the pointer must be left on the final
ket. ket.
The function also maintains the max_lookbehind value. Any lookbehind branch
that contains a nested lookbehind may actually look further back than the
length of the branch. The additional amount is passed back from
get_branchlength() as an "extra" value.
Arguments: Arguments:
pptrptr pointer to pointer in the parsed pattern pptrptr pointer to pointer in the parsed pattern
maxptr where to return maximum length for the whole group maxptr where to return maximum lookbehind for the whole group
errcodeptr pointer to error code errcodeptr pointer to error code
lcptr pointer to loop counter lcptr pointer to loop counter
recurses chain of recurse_check to catch mutual recursion recurses chain of recurse_check to catch mutual recursion
@ -9317,6 +9343,7 @@ set_lookbehind_lengths(uint32_t **pptrptr, int *maxptr, int *errcodeptr,
{ {
PCRE2_SIZE offset; PCRE2_SIZE offset;
int branchlength; int branchlength;
int branchextra;
int max = 0; int max = 0;
uint32_t *bptr = *pptrptr; uint32_t *bptr = *pptrptr;
@ -9326,7 +9353,8 @@ READPLUSOFFSET(offset, bptr); /* Offset for error messages */
do do
{ {
*pptrptr += 1; *pptrptr += 1;
branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
recurses, cb);
if (branchlength < 0) if (branchlength < 0)
{ {
/* The errorcode and offset may already be set from a nested lookbehind. */ /* The errorcode and offset may already be set from a nested lookbehind. */
@ -9334,12 +9362,13 @@ do
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset; if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
return FALSE; return FALSE;
} }
if (branchlength > max) max = branchlength; if (branchlength + branchextra > max) max = branchlength + branchextra;
*bptr |= branchlength; /* branchlength never more than 65535 */ *bptr |= branchlength; /* branchlength never more than 65535 */
bptr = *pptrptr; bptr = *pptrptr;
} }
while (*bptr == META_ALT); while (*bptr == META_ALT);
if (max > cb->max_lookbehind) cb->max_lookbehind = max;
*maxptr = max; *maxptr = max;
return TRUE; return TRUE;
} }

12
testdata/testinput15 vendored
View File

@ -157,6 +157,18 @@
/(?<=ab)cdef/ /(?<=ab)cdef/
xxabcd\=ph xxabcd\=ph
/(?<=(?<=(?<=a)b)c)./I
123abcXYZ
/(?<=ab(cd(?<=...)))./I
abcdX
/(?<=ab((?<=...)cd))./I
ZabcdX
/(?<=((?<=(?<=ab).))(?1)(?1))./I
abxZ
#subject #subject
# ------------------------------------------------------------------- # -------------------------------------------------------------------

35
testdata/testoutput15 vendored
View File

@ -335,6 +335,41 @@ Partial match: abcd
Partial match: abcd Partial match: abcd
<< <<
/(?<=(?<=(?<=a)b)c)./I
Capture group count = 0
Max lookbehind = 3
Subject length lower bound = 1
123abcXYZ
0: abcX
<<<
/(?<=ab(cd(?<=...)))./I
Capture group count = 1
Max lookbehind = 4
Subject length lower bound = 1
abcdX
0: abcdX
<<<<
1: cd
/(?<=ab((?<=...)cd))./I
Capture group count = 1
Max lookbehind = 5
Subject length lower bound = 1
ZabcdX
0: ZabcdX
<<<<<
1: cd
/(?<=((?<=(?<=ab).))(?1)(?1))./I
Capture group count = 1
Max lookbehind = 3
Subject length lower bound = 1
abxZ
0: abxZ
<<<
1:
#subject #subject
# ------------------------------------------------------------------- # -------------------------------------------------------------------