Fix wide character problem with negated POSIX ascii and xdigit class items.

This commit is contained in:
Philip.Hazel 2015-11-20 16:55:36 +00:00
parent 3485b14a18
commit 11e0001b14
6 changed files with 205 additions and 9 deletions

View File

@ -321,6 +321,12 @@ mode. The POSIX class got lost, but only if the single character followed it.
96. [:punct:] in UCP mode was matching some characters in the range 128-255 96. [:punct:] in UCP mode was matching some characters in the range 128-255
that should not have been matched. that should not have been matched.
97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all
characters with code points greater than 255 are in the class. When a Unicode
property was also in the class (if PCRE2_UCP is set, escapes such as \w are
turned into Unicode properties), wide characters were not correctly handled,
and could fail to match.
Version 10.20 30-June-2015 Version 10.20 30-June-2015
-------------------------- --------------------------

View File

@ -3857,6 +3857,7 @@ for (;; ptr++)
{ {
BOOL negate_class; BOOL negate_class;
BOOL should_flip_negation; BOOL should_flip_negation;
BOOL match_all_wide_chars;
BOOL possessive_quantifier; BOOL possessive_quantifier;
BOOL is_quantifier; BOOL is_quantifier;
BOOL is_recurse; BOOL is_recurse;
@ -4187,11 +4188,12 @@ for (;; ptr++)
break; break;
} }
/* If a class contains a negative special such as \S, we need to flip the /* If a non-extended class contains a negative special such as \S, we need
negation flag at the end, so that support for characters > 255 works to flip the negation flag at the end, so that support for characters > 255
correctly (they are all included in the class). */ works correctly (they are all included in the class). An extended class may
need to insert specific matching code for wide characters. */
should_flip_negation = FALSE; should_flip_negation = match_all_wide_chars = FALSE;
/* Extended class (xclass) will be used when characters > 255 /* Extended class (xclass) will be used when characters > 255
might match. */ might match. */
@ -4345,10 +4347,23 @@ for (;; ptr++)
ptr = tempptr + 1; ptr = tempptr + 1;
goto CONTINUE_CLASS; goto CONTINUE_CLASS;
/* For all other POSIX classes, no special action is taken in UCP /* For the other POSIX classes (ascii, xdigit) we are going to fall
mode. Fall through to the non_UCP case. */ through to the non-UCP case and build a bit map for characters with
code points less than 256. If we are in a negated POSIX class
within a non-negated overall class, characters with code points
greater than 255 must all match. In the special case where we have
not yet generated any xclass data, and this is the final item in
the overall class, we need do nothing: later on, the opcode
OP_NCLASS will be used to indicate that characters greater than 255
are acceptable. If we have already seen an xclass item or one may
follow (we have to assume that it might if this is not the end of
the class), set a flag to cause the generation of an explicit range
for all wide codepoints. */
default: default:
if (!negate_class && local_negate &&
(xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
match_all_wide_chars = TRUE;
break; break;
} }
} }
@ -4848,9 +4863,15 @@ for (;; ptr++)
unless there were no property settings and there was a negated special such unless there were no property settings and there was a negated special such
as \S in the class, and PCRE2_UCP is not set, because in that case all as \S in the class, and PCRE2_UCP is not set, because in that case all
characters > 255 are in the class, so any that were explicitly given as characters > 255 are in the class, so any that were explicitly given as
well can be ignored. If (when there are explicit characters > 255 or well can be ignored.
property settings that must be listed) there are no characters < 256, we
can omit the bitmap in the actual compiled code. */ In the UCP case, if certain negated POSIX classes ([:^ascii:] or
[:^xdigit:]) were present in a non-negated class, we again have to match
all wide characters, indicated by match_all_wide_chars being true. We do
this by including an explicit range.
If, when generating an xclass, there are no characters < 256, we can omit
the bitmap in the actual compiled code. */
#ifdef SUPPORT_WIDE_CHARS #ifdef SUPPORT_WIDE_CHARS
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
@ -4860,6 +4881,13 @@ for (;; ptr++)
if (xclass && (xclass_has_prop || !should_flip_negation)) if (xclass && (xclass_has_prop || !should_flip_negation))
#endif #endif
{ {
if (match_all_wide_chars)
{
*class_uchardata++ = XCL_RANGE;
class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT,
class_uchardata);
}
*class_uchardata++ = XCL_END; /* Marks the end of extra data */ *class_uchardata++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS; *code++ = OP_XCLASS;
code += LINK_SIZE; code += LINK_SIZE;

36
testdata/testinput4 vendored
View File

@ -2236,4 +2236,40 @@
/[[:punct:]]/utf,ucp /[[:punct:]]/utf,ucp
\x{b4} \x{b4}
/[[:^ascii:]]/utf,ucp
\x{100}
\x{200}
\x{300}
\x{37e}
\= Expect no match
aa
99
/[[:^ascii:]\w]/utf,ucp
aa
99
gg
\x{100}
\x{200}
\x{300}
\x{37e}
/[\w[:^ascii:]]/utf,ucp
aa
99
gg
\x{100}
\x{200}
\x{300}
\x{37e}
/[^[:ascii:]\W]/utf,ucp
\x{100}
\x{200}
\= Expect no match
aa
99
gg
\x{37e}
# End of testinput4 # End of testinput4

15
testdata/testinput5 vendored
View File

@ -1697,4 +1697,19 @@
/a[b[:punct:]]/utf,ucp,bincode /a[b[:punct:]]/utf,ucp,bincode
/[[:^ascii:]]/utf,ucp,bincode
/[[:^ascii:]\w]/utf,ucp,bincode
/[\w[:^ascii:]]/utf,ucp,bincode
/[^[:ascii:]\W]/utf,ucp,bincode
\x{de}
\x{200}
\= Expect no match
\x{300}
\x{37e}
/[[:^ascii:]a]/utf,ucp,bincode
# End of testinput5 # End of testinput5

62
testdata/testoutput4 vendored
View File

@ -3624,4 +3624,66 @@ No match
\x{b4} \x{b4}
No match No match
/[[:^ascii:]]/utf,ucp
\x{100}
0: \x{100}
\x{200}
0: \x{200}
\x{300}
0: \x{300}
\x{37e}
0: \x{37e}
\= Expect no match
aa
No match
99
No match
/[[:^ascii:]\w]/utf,ucp
aa
0: a
99
0: 9
gg
0: g
\x{100}
0: \x{100}
\x{200}
0: \x{200}
\x{300}
0: \x{300}
\x{37e}
0: \x{37e}
/[\w[:^ascii:]]/utf,ucp
aa
0: a
99
0: 9
gg
0: g
\x{100}
0: \x{100}
\x{200}
0: \x{200}
\x{300}
0: \x{300}
\x{37e}
0: \x{37e}
/[^[:ascii:]\W]/utf,ucp
\x{100}
0: \x{100}
\x{200}
0: \x{200}
\= Expect no match
aa
No match
99
No match
gg
No match
\x{37e}
No match
# End of testinput4 # End of testinput4

49
testdata/testoutput5 vendored
View File

@ -4097,4 +4097,53 @@ Subject length lower bound = 0
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/[[:^ascii:]]/utf,ucp,bincode
------------------------------------------------------------------
Bra
[\x80-\xff] (neg)
Ket
End
------------------------------------------------------------------
/[[:^ascii:]\w]/utf,ucp,bincode
------------------------------------------------------------------
Bra
[\x80-\xff\p{Xwd}\x{100}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
/[\w[:^ascii:]]/utf,ucp,bincode
------------------------------------------------------------------
Bra
[\x80-\xff\p{Xwd}\x{100}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
/[^[:ascii:]\W]/utf,ucp,bincode
------------------------------------------------------------------
Bra
[^\x00-\x7f\P{Xwd}]
Ket
End
------------------------------------------------------------------
\x{de}
0: \x{de}
\x{200}
0: \x{200}
\= Expect no match
\x{300}
No match
\x{37e}
No match
/[[:^ascii:]a]/utf,ucp,bincode
------------------------------------------------------------------
Bra
[a\x80-\xff] (neg)
Ket
End
------------------------------------------------------------------
# End of testinput5 # End of testinput5