From 255f5e741ba73c7d1ea0730fb09f45a9d0f581d2 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 13 Feb 2019 17:30:24 +0000 Subject: [PATCH] Compile \p{Any} the same as . in DOTALL mode, to benefit from auto-anchoring. --- ChangeLog | 3 +++ src/pcre2_compile.c | 59 ++++++++++++++++++++++++++------------------ testdata/testinput5 | 4 +++ testdata/testoutput5 | 47 +++++++++++++++++++++-------------- 4 files changed, 70 insertions(+), 43 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9997ba0..990a6a3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -128,6 +128,9 @@ ClusterFuzz 12950, fixed before release. 31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh} construct. +32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits +from auto-anchoring if \p{Any}* starts a pattern. + Version 10.32 10-September-2018 ------------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 5a6f88c..90d30a5 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -1459,7 +1459,7 @@ Returns: zero => a data character int PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, - int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, + int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, compile_block *cb) { BOOL utf = (options & PCRE2_UTF) != 0; @@ -1551,7 +1551,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0) /* Escapes that need further processing, including those that are unknown, have a zero entry in the lookup table. When called from pcre2_substitute(), only \c, -\o, and \x are recognized (\u and \U can never appear as they are used for case +\o, and \x are recognized (\u and \U can never appear as they are used for case forcing). */ else @@ -1559,7 +1559,7 @@ else int s; PCRE2_SPTR oldptr; BOOL overflow; - BOOL alt_bsux = + BOOL alt_bsux = ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0; /* Filter calls from pcre2_substitute(). */ @@ -1571,8 +1571,8 @@ else *errorcodeptr = ERR3; return 0; } - alt_bsux = FALSE; /* Do not modify \x handling */ - } + alt_bsux = FALSE; /* Do not modify \x handling */ + } switch (c) { @@ -1595,37 +1595,37 @@ else if (!alt_bsux) *errorcodeptr = ERR37; else { uint32_t xc; - + if (ptr >= ptrend) break; - if (*ptr == CHAR_LEFT_CURLY_BRACKET && + if (*ptr == CHAR_LEFT_CURLY_BRACKET && (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0) { PCRE2_SPTR hptr = ptr + 1; cc = 0; - + while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff) - { + { if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */ { *errorcodeptr = ERR77; ptr = hptr; /* Show where */ - break; /* *hptr != } will cause another break below */ - } + break; /* *hptr != } will cause another break below */ + } cc = (cc << 4) | xc; - hptr++; - } - + hptr++; + } + if (hptr == ptr + 1 || /* No hex digits */ hptr >= ptrend || /* Hit end of input */ *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ break; /* Hex escape not recognized */ - + c = cc; /* Accept the code point */ - ptr = hptr + 1; + ptr = hptr + 1; } - + else /* Must be exactly 4 hex digits */ - { + { if (ptrend - ptr < 4) break; /* Less than 4 chars */ if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ @@ -1635,8 +1635,8 @@ else if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ c = (cc << 4) | xc; ptr += 4; - } - + } + if (utf) { if (c > 0x10ffffU) *errorcodeptr = ERR77; @@ -3424,7 +3424,7 @@ while (ptr < ptrend) else { tempptr = ptr; - escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, cb->cx->extra_options, TRUE, cb); if (errorcode != 0) @@ -7631,9 +7631,20 @@ for (;; pptr++) { uint32_t ptype = *(++pptr) >> 16; uint32_t pdata = *pptr & 0xffff; - *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; - *code++ = ptype; - *code++ = pdata; + + /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit + from the auto-anchoring code. */ + + if (meta_arg == ESC_p && ptype == PT_ANY) + { + *code++ = OP_ALLANY; + } + else + { + *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; + } break; /* End META_ESCAPE */ } #endif diff --git a/testdata/testinput5 b/testdata/testinput5 index 2c4e847..7c58145 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2170,4 +2170,8 @@ /(?'X²ABC'...)/utf +# ------- + +/\p{Any}*xyz/I + # End of testinput5 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 23438dd..5d64d00 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -3294,27 +3294,27 @@ No match /\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp ------------------------------------------------------------------ Bra - prop Any + - prop Any - prop Any + + AllAny+ + AllAny + AllAny+ notprop Any - prop Any + + AllAny+ prop L& - prop Any + + AllAny+ prop L - prop Any + + AllAny+ prop Lu - prop Any + + AllAny+ prop Han - prop Any + + AllAny+ prop Xan - prop Any + + AllAny+ prop Xsp - prop Any + + AllAny+ prop Xps prop Xwd + - prop Any - prop Any + + AllAny + AllAny+ prop Xuc Ket End @@ -3324,7 +3324,7 @@ No match ------------------------------------------------------------------ Bra prop L& + - prop Any + AllAny prop L& + prop L& notprop L& ++ @@ -3355,7 +3355,7 @@ No match ------------------------------------------------------------------ Bra prop N + - prop Any + AllAny prop N + prop L& prop N ++ @@ -3386,7 +3386,7 @@ No match ------------------------------------------------------------------ Bra prop Lu + - prop Any + AllAny prop Lu + prop L& prop Lu + @@ -3448,7 +3448,7 @@ No match ------------------------------------------------------------------ Bra prop Xan + - prop Any + AllAny prop Xan + prop L& notprop Xan ++ @@ -3479,7 +3479,7 @@ No match ------------------------------------------------------------------ Bra prop Xsp + - prop Any + AllAny prop Xsp ++ prop L& prop Xsp ++ @@ -3508,7 +3508,7 @@ No match ------------------------------------------------------------------ Bra prop Xwd + - prop Any + AllAny prop Xwd + prop L& prop Xwd + @@ -3537,7 +3537,7 @@ No match ------------------------------------------------------------------ Bra prop Xuc + - prop Any + AllAny prop Xuc + prop L& prop Xuc + @@ -4924,4 +4924,13 @@ Failed: error 162 at offset 3: subpattern name expected /(?'X²ABC'...)/utf Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?) +# ------- + +/\p{Any}*xyz/I +Capture group count = 0 +Compile options: +Overall options: anchored +Last code unit = 'z' +Subject length lower bound = 3 + # End of testinput5