From 11e0001b14cabfacdecc70edd63c9a83b929b93c Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 20 Nov 2015 16:55:36 +0000 Subject: [PATCH] Fix wide character problem with negated POSIX ascii and xdigit class items. --- ChangeLog | 6 +++++ src/pcre2_compile.c | 46 +++++++++++++++++++++++++------- testdata/testinput4 | 36 +++++++++++++++++++++++++ testdata/testinput5 | 15 +++++++++++ testdata/testoutput4 | 62 ++++++++++++++++++++++++++++++++++++++++++++ testdata/testoutput5 | 49 ++++++++++++++++++++++++++++++++++ 6 files changed, 205 insertions(+), 9 deletions(-) diff --git a/ChangeLog b/ChangeLog index 31e8aa4..224b3d7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -321,6 +321,12 @@ mode. The POSIX class got lost, but only if the single character followed it. 96. [:punct:] in UCP mode was matching some characters in the range 128-255 that should not have been matched. +97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all +characters with code points greater than 255 are in the class. When a Unicode +property was also in the class (if PCRE2_UCP is set, escapes such as \w are +turned into Unicode properties), wide characters were not correctly handled, +and could fail to match. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index ff7bebd..8637f0f 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3857,6 +3857,7 @@ for (;; ptr++) { BOOL negate_class; BOOL should_flip_negation; + BOOL match_all_wide_chars; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; @@ -4187,11 +4188,12 @@ for (;; ptr++) break; } - /* If a class contains a negative special such as \S, we need to flip the - negation flag at the end, so that support for characters > 255 works - correctly (they are all included in the class). 
*/ + /* If a non-extended class contains a negative special such as \S, we need + to flip the negation flag at the end, so that support for characters > 255 + works correctly (they are all included in the class). An extended class may + need to insert specific matching code for wide characters. */ - should_flip_negation = FALSE; + should_flip_negation = match_all_wide_chars = FALSE; /* Extended class (xclass) will be used when characters > 255 might match. */ @@ -4345,10 +4347,23 @@ for (;; ptr++) ptr = tempptr + 1; goto CONTINUE_CLASS; - /* For all other POSIX classes, no special action is taken in UCP - mode. Fall through to the non_UCP case. */ + /* For the other POSIX classes (ascii, xdigit) we are going to fall + through to the non-UCP case and build a bit map for characters with + code points less than 256. If we are in a negated POSIX class + within a non-negated overall class, characters with code points + greater than 255 must all match. In the special case where we have + not yet generated any xclass data, and this is the final item in + the overall class, we need do nothing: later on, the opcode + OP_NCLASS will be used to indicate that characters greater than 255 + are acceptable. If we have already seen an xclass item or one may + follow (we have to assume that it might if this is not the end of + the class), set a flag to cause the generation of an explicit range + for all wide codepoints. */ default: + if (!negate_class && local_negate && + (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET)) + match_all_wide_chars = TRUE; break; } } @@ -4848,9 +4863,15 @@ for (;; ptr++) unless there were no property settings and there was a negated special such as \S in the class, and PCRE2_UCP is not set, because in that case all characters > 255 are in the class, so any that were explicitly given as - well can be ignored. 
If (when there are explicit characters > 255 or - property settings that must be listed) there are no characters < 256, we - can omit the bitmap in the actual compiled code. */ + well can be ignored. + + In the UCP case, if certain negated POSIX classes ([:^ascii:] or + [:^xdigit:]) were present in a non-negated class, we again have to match + all wide characters, indicated by match_all_wide_chars being true. We do + this by including an explicit range. + + If, when generating an xclass, there are no characters < 256, we can omit + the bitmap in the actual compiled code. */ #ifdef SUPPORT_WIDE_CHARS #ifdef SUPPORT_UNICODE @@ -4860,6 +4881,13 @@ for (;; ptr++) if (xclass && (xclass_has_prop || !should_flip_negation)) #endif { + if (match_all_wide_chars) + { + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); + class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, + class_uchardata); + } *class_uchardata++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; code += LINK_SIZE; diff --git a/testdata/testinput4 b/testdata/testinput4 index dfaa1c0..464d798 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2236,4 +2236,40 @@ /[[:punct:]]/utf,ucp \x{b4} +/[[:^ascii:]]/utf,ucp + \x{100} + \x{200} + \x{300} + \x{37e} +\= Expect no match + aa + 99 + +/[[:^ascii:]\w]/utf,ucp + aa + 99 + gg + \x{100} + \x{200} + \x{300} + \x{37e} + +/[\w[:^ascii:]]/utf,ucp + aa + 99 + gg + \x{100} + \x{200} + \x{300} + \x{37e} + +/[^[:ascii:]\W]/utf,ucp + \x{100} + \x{200} +\= Expect no match + aa + 99 + gg + \x{37e} + # End of testinput4 diff --git a/testdata/testinput5 b/testdata/testinput5 index 2432d19..136712e 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1697,4 +1697,19 @@ /a[b[:punct:]]/utf,ucp,bincode +/[[:^ascii:]]/utf,ucp,bincode + +/[[:^ascii:]\w]/utf,ucp,bincode + +/[\w[:^ascii:]]/utf,ucp,bincode + +/[^[:ascii:]\W]/utf,ucp,bincode + \x{de} + \x{200} +\= Expect no match + \x{300} + \x{37e} + 
+/[[:^ascii:]a]/utf,ucp,bincode + # End of testinput5 diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 0814646..9504c5e 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -3624,4 +3624,66 @@ No match \x{b4} No match +/[[:^ascii:]]/utf,ucp + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} + 0: \x{300} + \x{37e} + 0: \x{37e} +\= Expect no match + aa +No match + 99 +No match + +/[[:^ascii:]\w]/utf,ucp + aa + 0: a + 99 + 0: 9 + gg + 0: g + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} + 0: \x{300} + \x{37e} + 0: \x{37e} + +/[\w[:^ascii:]]/utf,ucp + aa + 0: a + 99 + 0: 9 + gg + 0: g + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} + 0: \x{300} + \x{37e} + 0: \x{37e} + +/[^[:ascii:]\W]/utf,ucp + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} +\= Expect no match + aa +No match + 99 +No match + gg +No match + \x{37e} +No match + # End of testinput4 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 7eb9df3..9879231 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4097,4 +4097,53 @@ Subject length lower bound = 0 End ------------------------------------------------------------------ +/[[:^ascii:]]/utf,ucp,bincode +------------------------------------------------------------------ + Bra + [\x80-\xff] (neg) + Ket + End +------------------------------------------------------------------ + +/[[:^ascii:]\w]/utf,ucp,bincode +------------------------------------------------------------------ + Bra + [\x80-\xff\p{Xwd}\x{100}-\x{10ffff}] + Ket + End +------------------------------------------------------------------ + +/[\w[:^ascii:]]/utf,ucp,bincode +------------------------------------------------------------------ + Bra + [\x80-\xff\p{Xwd}\x{100}-\x{10ffff}] + Ket + End +------------------------------------------------------------------ + +/[^[:ascii:]\W]/utf,ucp,bincode +------------------------------------------------------------------ + Bra + [^\x00-\x7f\P{Xwd}] + Ket + End 
+------------------------------------------------------------------ + \x{de} + 0: \x{de} + \x{200} + 0: \x{200} +\= Expect no match + \x{300} +No match + \x{37e} +No match + +/[[:^ascii:]a]/utf,ucp,bincode +------------------------------------------------------------------ + Bra + [a\x80-\xff] (neg) + Ket + End +------------------------------------------------------------------ + # End of testinput5