Compile \p{Any} the same as "." in DOTALL mode, to benefit from auto-anchoring.

This commit is contained in:
Philip.Hazel 2019-02-13 17:30:24 +00:00
parent f2e1cea288
commit 255f5e741b
4 changed files with 70 additions and 43 deletions

View File

@ -128,6 +128,9 @@ ClusterFuzz 12950, fixed before release.
31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh}
construct.
32. Compile \p{Any} to be the same as "." in DOTALL mode, so that it benefits
from auto-anchoring if \p{Any}* starts a pattern.
Version 10.32 10-September-2018
-------------------------------

View File

@ -1459,7 +1459,7 @@ Returns: zero => a data character
int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
compile_block *cb)
{
BOOL utf = (options & PCRE2_UTF) != 0;
@ -1551,7 +1551,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
/* Escapes that need further processing, including those that are unknown, have
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
\o, and \x are recognized (\u and \U can never appear as they are used for case
\o, and \x are recognized (\u and \U can never appear as they are used for case
forcing). */
else
@ -1559,7 +1559,7 @@ else
int s;
PCRE2_SPTR oldptr;
BOOL overflow;
BOOL alt_bsux =
BOOL alt_bsux =
((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
/* Filter calls from pcre2_substitute(). */
@ -1571,8 +1571,8 @@ else
*errorcodeptr = ERR3;
return 0;
}
alt_bsux = FALSE; /* Do not modify \x handling */
}
alt_bsux = FALSE; /* Do not modify \x handling */
}
switch (c)
{
@ -1595,37 +1595,37 @@ else
if (!alt_bsux) *errorcodeptr = ERR37; else
{
uint32_t xc;
if (ptr >= ptrend) break;
if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
(extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
{
PCRE2_SPTR hptr = ptr + 1;
cc = 0;
while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
{
{
if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
{
*errorcodeptr = ERR77;
ptr = hptr; /* Show where */
break; /* *hptr != } will cause another break below */
}
break; /* *hptr != } will cause another break below */
}
cc = (cc << 4) | xc;
hptr++;
}
hptr++;
}
if (hptr == ptr + 1 || /* No hex digits */
hptr >= ptrend || /* Hit end of input */
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
break; /* Hex escape not recognized */
c = cc; /* Accept the code point */
ptr = hptr + 1;
ptr = hptr + 1;
}
else /* Must be exactly 4 hex digits */
{
{
if (ptrend - ptr < 4) break; /* Less than 4 chars */
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
@ -1635,8 +1635,8 @@ else
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 4;
}
}
if (utf)
{
if (c > 0x10ffffU) *errorcodeptr = ERR77;
@ -3424,7 +3424,7 @@ while (ptr < ptrend)
else
{
tempptr = ptr;
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
cb->cx->extra_options, TRUE, cb);
if (errorcode != 0)
@ -7631,9 +7631,20 @@ for (;; pptr++)
{
uint32_t ptype = *(++pptr) >> 16;
uint32_t pdata = *pptr & 0xffff;
*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
/* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
from the auto-anchoring code. */
if (meta_arg == ESC_p && ptype == PT_ANY)
{
*code++ = OP_ALLANY;
}
else
{
*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
}
break; /* End META_ESCAPE */
}
#endif

4
testdata/testinput5 vendored
View File

@ -2170,4 +2170,8 @@
/(?'X²ABC'...)/utf
# -------
/\p{Any}*xyz/I
# End of testinput5

47
testdata/testoutput5 vendored
View File

@ -3294,27 +3294,27 @@ No match
/\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp
------------------------------------------------------------------
Bra
prop Any +
prop Any
prop Any +
AllAny+
AllAny
AllAny+
notprop Any
prop Any +
AllAny+
prop L&
prop Any +
AllAny+
prop L
prop Any +
AllAny+
prop Lu
prop Any +
AllAny+
prop Han
prop Any +
AllAny+
prop Xan
prop Any +
AllAny+
prop Xsp
prop Any +
AllAny+
prop Xps
prop Xwd +
prop Any
prop Any +
AllAny
AllAny+
prop Xuc
Ket
End
@ -3324,7 +3324,7 @@ No match
------------------------------------------------------------------
Bra
prop L& +
prop Any
AllAny
prop L& +
prop L&
notprop L& ++
@ -3355,7 +3355,7 @@ No match
------------------------------------------------------------------
Bra
prop N +
prop Any
AllAny
prop N +
prop L&
prop N ++
@ -3386,7 +3386,7 @@ No match
------------------------------------------------------------------
Bra
prop Lu +
prop Any
AllAny
prop Lu +
prop L&
prop Lu +
@ -3448,7 +3448,7 @@ No match
------------------------------------------------------------------
Bra
prop Xan +
prop Any
AllAny
prop Xan +
prop L&
notprop Xan ++
@ -3479,7 +3479,7 @@ No match
------------------------------------------------------------------
Bra
prop Xsp +
prop Any
AllAny
prop Xsp ++
prop L&
prop Xsp ++
@ -3508,7 +3508,7 @@ No match
------------------------------------------------------------------
Bra
prop Xwd +
prop Any
AllAny
prop Xwd +
prop L&
prop Xwd +
@ -3537,7 +3537,7 @@ No match
------------------------------------------------------------------
Bra
prop Xuc +
prop Any
AllAny
prop Xuc +
prop L&
prop Xuc +
@ -4924,4 +4924,13 @@ Failed: error 162 at offset 3: subpattern name expected
/(?'X²ABC'...)/utf
Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
# -------
/\p{Any}*xyz/I
Capture group count = 0
Compile options: <none>
Overall options: anchored
Last code unit = 'z'
Subject length lower bound = 3
# End of testinput5