Compile \p{Any} the same as "." in DOTALL mode, so that a pattern starting with \p{Any}* benefits from auto-anchoring.

This commit is contained in:
Philip.Hazel 2019-02-13 17:30:24 +00:00
parent f2e1cea288
commit 255f5e741b
4 changed files with 70 additions and 43 deletions

View File

@ -128,6 +128,9 @@ ClusterFuzz 12950, fixed before release.
31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh} 31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh}
construct. construct.
32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits
from auto-anchoring if \p{Any}* starts a pattern.
Version 10.32 10-September-2018 Version 10.32 10-September-2018
------------------------------- -------------------------------

View File

@ -1459,7 +1459,7 @@ Returns: zero => a data character
int int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
compile_block *cb) compile_block *cb)
{ {
BOOL utf = (options & PCRE2_UTF) != 0; BOOL utf = (options & PCRE2_UTF) != 0;
@ -1551,7 +1551,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
/* Escapes that need further processing, including those that are unknown, have /* Escapes that need further processing, including those that are unknown, have
a zero entry in the lookup table. When called from pcre2_substitute(), only \c, a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
\o, and \x are recognized (\u and \U can never appear as they are used for case \o, and \x are recognized (\u and \U can never appear as they are used for case
forcing). */ forcing). */
else else
@ -1559,7 +1559,7 @@ else
int s; int s;
PCRE2_SPTR oldptr; PCRE2_SPTR oldptr;
BOOL overflow; BOOL overflow;
BOOL alt_bsux = BOOL alt_bsux =
((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0; ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
/* Filter calls from pcre2_substitute(). */ /* Filter calls from pcre2_substitute(). */
@ -1571,8 +1571,8 @@ else
*errorcodeptr = ERR3; *errorcodeptr = ERR3;
return 0; return 0;
} }
alt_bsux = FALSE; /* Do not modify \x handling */ alt_bsux = FALSE; /* Do not modify \x handling */
} }
switch (c) switch (c)
{ {
@ -1595,37 +1595,37 @@ else
if (!alt_bsux) *errorcodeptr = ERR37; else if (!alt_bsux) *errorcodeptr = ERR37; else
{ {
uint32_t xc; uint32_t xc;
if (ptr >= ptrend) break; if (ptr >= ptrend) break;
if (*ptr == CHAR_LEFT_CURLY_BRACKET && if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
(extra_options & PCRE2_EXTRA_ALT_BSUX) != 0) (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
{ {
PCRE2_SPTR hptr = ptr + 1; PCRE2_SPTR hptr = ptr + 1;
cc = 0; cc = 0;
while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff) while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
{ {
if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */ if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
{ {
*errorcodeptr = ERR77; *errorcodeptr = ERR77;
ptr = hptr; /* Show where */ ptr = hptr; /* Show where */
break; /* *hptr != } will cause another break below */ break; /* *hptr != } will cause another break below */
} }
cc = (cc << 4) | xc; cc = (cc << 4) | xc;
hptr++; hptr++;
} }
if (hptr == ptr + 1 || /* No hex digits */ if (hptr == ptr + 1 || /* No hex digits */
hptr >= ptrend || /* Hit end of input */ hptr >= ptrend || /* Hit end of input */
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
break; /* Hex escape not recognized */ break; /* Hex escape not recognized */
c = cc; /* Accept the code point */ c = cc; /* Accept the code point */
ptr = hptr + 1; ptr = hptr + 1;
} }
else /* Must be exactly 4 hex digits */ else /* Must be exactly 4 hex digits */
{ {
if (ptrend - ptr < 4) break; /* Less than 4 chars */ if (ptrend - ptr < 4) break; /* Less than 4 chars */
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
@ -1635,8 +1635,8 @@ else
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc; c = (cc << 4) | xc;
ptr += 4; ptr += 4;
} }
if (utf) if (utf)
{ {
if (c > 0x10ffffU) *errorcodeptr = ERR77; if (c > 0x10ffffU) *errorcodeptr = ERR77;
@ -3424,7 +3424,7 @@ while (ptr < ptrend)
else else
{ {
tempptr = ptr; tempptr = ptr;
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
cb->cx->extra_options, TRUE, cb); cb->cx->extra_options, TRUE, cb);
if (errorcode != 0) if (errorcode != 0)
@ -7631,9 +7631,20 @@ for (;; pptr++)
{ {
uint32_t ptype = *(++pptr) >> 16; uint32_t ptype = *(++pptr) >> 16;
uint32_t pdata = *pptr & 0xffff; uint32_t pdata = *pptr & 0xffff;
*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
/* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
from the auto-anchoring code. */
if (meta_arg == ESC_p && ptype == PT_ANY)
{
*code++ = OP_ALLANY;
}
else
{
*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
}
break; /* End META_ESCAPE */ break; /* End META_ESCAPE */
} }
#endif #endif

4
testdata/testinput5 vendored
View File

@ -2170,4 +2170,8 @@
/(?'X²ABC'...)/utf /(?'X²ABC'...)/utf
# -------
/\p{Any}*xyz/I
# End of testinput5 # End of testinput5

47
testdata/testoutput5 vendored
View File

@ -3294,27 +3294,27 @@ No match
/\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp /\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop Any + AllAny+
prop Any AllAny
prop Any + AllAny+
notprop Any notprop Any
prop Any + AllAny+
prop L& prop L&
prop Any + AllAny+
prop L prop L
prop Any + AllAny+
prop Lu prop Lu
prop Any + AllAny+
prop Han prop Han
prop Any + AllAny+
prop Xan prop Xan
prop Any + AllAny+
prop Xsp prop Xsp
prop Any + AllAny+
prop Xps prop Xps
prop Xwd + prop Xwd +
prop Any AllAny
prop Any + AllAny+
prop Xuc prop Xuc
Ket Ket
End End
@ -3324,7 +3324,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop L& + prop L& +
prop Any AllAny
prop L& + prop L& +
prop L& prop L&
notprop L& ++ notprop L& ++
@ -3355,7 +3355,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop N + prop N +
prop Any AllAny
prop N + prop N +
prop L& prop L&
prop N ++ prop N ++
@ -3386,7 +3386,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop Lu + prop Lu +
prop Any AllAny
prop Lu + prop Lu +
prop L& prop L&
prop Lu + prop Lu +
@ -3448,7 +3448,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop Xan + prop Xan +
prop Any AllAny
prop Xan + prop Xan +
prop L& prop L&
notprop Xan ++ notprop Xan ++
@ -3479,7 +3479,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop Xsp + prop Xsp +
prop Any AllAny
prop Xsp ++ prop Xsp ++
prop L& prop L&
prop Xsp ++ prop Xsp ++
@ -3508,7 +3508,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop Xwd + prop Xwd +
prop Any AllAny
prop Xwd + prop Xwd +
prop L& prop L&
prop Xwd + prop Xwd +
@ -3537,7 +3537,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
prop Xuc + prop Xuc +
prop Any AllAny
prop Xuc + prop Xuc +
prop L& prop L&
prop Xuc + prop Xuc +
@ -4924,4 +4924,13 @@ Failed: error 162 at offset 3: subpattern name expected
/(?'X²ABC'...)/utf /(?'X²ABC'...)/utf
Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?) Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
# -------
/\p{Any}*xyz/I
Capture group count = 0
Compile options: <none>
Overall options: anchored
Last code unit = 'z'
Subject length lower bound = 3
# End of testinput5 # End of testinput5