Fix wide character problem with negated POSIX ascii and xdigit class items.
This commit is contained in:
parent
3485b14a18
commit
11e0001b14
|
@ -321,6 +321,12 @@ mode. The POSIX class got lost, but only if the single character followed it.
|
||||||
96. [:punct:] in UCP mode was matching some characters in the range 128-255
|
96. [:punct:] in UCP mode was matching some characters in the range 128-255
|
||||||
that should not have been matched.
|
that should not have been matched.
|
||||||
|
|
||||||
|
97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all
|
||||||
|
characters with code points greater than 255 are in the class. When a Unicode
|
||||||
|
property was also in the class (if PCRE2_UCP is set, escapes such as \w are
|
||||||
|
turned into Unicode properties), wide characters were not correctly handled,
|
||||||
|
and could fail to match.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -3857,6 +3857,7 @@ for (;; ptr++)
|
||||||
{
|
{
|
||||||
BOOL negate_class;
|
BOOL negate_class;
|
||||||
BOOL should_flip_negation;
|
BOOL should_flip_negation;
|
||||||
|
BOOL match_all_wide_chars;
|
||||||
BOOL possessive_quantifier;
|
BOOL possessive_quantifier;
|
||||||
BOOL is_quantifier;
|
BOOL is_quantifier;
|
||||||
BOOL is_recurse;
|
BOOL is_recurse;
|
||||||
|
@ -4187,11 +4188,12 @@ for (;; ptr++)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If a class contains a negative special such as \S, we need to flip the
|
/* If a non-extended class contains a negative special such as \S, we need
|
||||||
negation flag at the end, so that support for characters > 255 works
|
to flip the negation flag at the end, so that support for characters > 255
|
||||||
correctly (they are all included in the class). */
|
works correctly (they are all included in the class). An extended class may
|
||||||
|
need to insert specific matching code for wide characters. */
|
||||||
|
|
||||||
should_flip_negation = FALSE;
|
should_flip_negation = match_all_wide_chars = FALSE;
|
||||||
|
|
||||||
/* Extended class (xclass) will be used when characters > 255
|
/* Extended class (xclass) will be used when characters > 255
|
||||||
might match. */
|
might match. */
|
||||||
|
@ -4345,10 +4347,23 @@ for (;; ptr++)
|
||||||
ptr = tempptr + 1;
|
ptr = tempptr + 1;
|
||||||
goto CONTINUE_CLASS;
|
goto CONTINUE_CLASS;
|
||||||
|
|
||||||
/* For all other POSIX classes, no special action is taken in UCP
|
/* For the other POSIX classes (ascii, xdigit) we are going to fall
|
||||||
mode. Fall through to the non_UCP case. */
|
through to the non-UCP case and build a bit map for characters with
|
||||||
|
code points less than 256. If we are in a negated POSIX class
|
||||||
|
within a non-negated overall class, characters with code points
|
||||||
|
greater than 255 must all match. In the special case where we have
|
||||||
|
not yet generated any xclass data, and this is the final item in
|
||||||
|
the overall class, we need do nothing: later on, the opcode
|
||||||
|
OP_NCLASS will be used to indicate that characters greater than 255
|
||||||
|
are acceptable. If we have already seen an xclass item or one may
|
||||||
|
follow (we have to assume that it might if this is not the end of
|
||||||
|
the class), set a flag to cause the generation of an explicit range
|
||||||
|
for all wide codepoints. */
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
if (!negate_class && local_negate &&
|
||||||
|
(xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
|
||||||
|
match_all_wide_chars = TRUE;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4848,9 +4863,15 @@ for (;; ptr++)
|
||||||
unless there were no property settings and there was a negated special such
|
unless there were no property settings and there was a negated special such
|
||||||
as \S in the class, and PCRE2_UCP is not set, because in that case all
|
as \S in the class, and PCRE2_UCP is not set, because in that case all
|
||||||
characters > 255 are in the class, so any that were explicitly given as
|
characters > 255 are in the class, so any that were explicitly given as
|
||||||
well can be ignored. If (when there are explicit characters > 255 or
|
well can be ignored.
|
||||||
property settings that must be listed) there are no characters < 256, we
|
|
||||||
can omit the bitmap in the actual compiled code. */
|
In the UCP case, if certain negated POSIX classes ([:^ascii:] or
|
||||||
|
[:^xdigit:]) were present in a non-negated class, we again have to match
|
||||||
|
all wide characters, indicated by match_all_wide_chars being true. We do
|
||||||
|
this by including an explicit range.
|
||||||
|
|
||||||
|
If, when generating an xclass, there are no characters < 256, we can omit
|
||||||
|
the bitmap in the actual compiled code. */
|
||||||
|
|
||||||
#ifdef SUPPORT_WIDE_CHARS
|
#ifdef SUPPORT_WIDE_CHARS
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
|
@ -4860,6 +4881,13 @@ for (;; ptr++)
|
||||||
if (xclass && (xclass_has_prop || !should_flip_negation))
|
if (xclass && (xclass_has_prop || !should_flip_negation))
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
|
if (match_all_wide_chars)
|
||||||
|
{
|
||||||
|
*class_uchardata++ = XCL_RANGE;
|
||||||
|
class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
|
||||||
|
class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT,
|
||||||
|
class_uchardata);
|
||||||
|
}
|
||||||
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
|
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
|
||||||
*code++ = OP_XCLASS;
|
*code++ = OP_XCLASS;
|
||||||
code += LINK_SIZE;
|
code += LINK_SIZE;
|
||||||
|
|
|
@ -2236,4 +2236,40 @@
|
||||||
/[[:punct:]]/utf,ucp
|
/[[:punct:]]/utf,ucp
|
||||||
\x{b4}
|
\x{b4}
|
||||||
|
|
||||||
|
/[[:^ascii:]]/utf,ucp
|
||||||
|
\x{100}
|
||||||
|
\x{200}
|
||||||
|
\x{300}
|
||||||
|
\x{37e}
|
||||||
|
\= Expect no match
|
||||||
|
aa
|
||||||
|
99
|
||||||
|
|
||||||
|
/[[:^ascii:]\w]/utf,ucp
|
||||||
|
aa
|
||||||
|
99
|
||||||
|
gg
|
||||||
|
\x{100}
|
||||||
|
\x{200}
|
||||||
|
\x{300}
|
||||||
|
\x{37e}
|
||||||
|
|
||||||
|
/[\w[:^ascii:]]/utf,ucp
|
||||||
|
aa
|
||||||
|
99
|
||||||
|
gg
|
||||||
|
\x{100}
|
||||||
|
\x{200}
|
||||||
|
\x{300}
|
||||||
|
\x{37e}
|
||||||
|
|
||||||
|
/[^[:ascii:]\W]/utf,ucp
|
||||||
|
\x{100}
|
||||||
|
\x{200}
|
||||||
|
\= Expect no match
|
||||||
|
aa
|
||||||
|
99
|
||||||
|
gg
|
||||||
|
\x{37e}
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
|
@ -1697,4 +1697,19 @@
|
||||||
|
|
||||||
/a[b[:punct:]]/utf,ucp,bincode
|
/a[b[:punct:]]/utf,ucp,bincode
|
||||||
|
|
||||||
|
/[[:^ascii:]]/utf,ucp,bincode
|
||||||
|
|
||||||
|
/[[:^ascii:]\w]/utf,ucp,bincode
|
||||||
|
|
||||||
|
/[\w[:^ascii:]]/utf,ucp,bincode
|
||||||
|
|
||||||
|
/[^[:ascii:]\W]/utf,ucp,bincode
|
||||||
|
\x{de}
|
||||||
|
\x{200}
|
||||||
|
\= Expect no match
|
||||||
|
\x{300}
|
||||||
|
\x{37e}
|
||||||
|
|
||||||
|
/[[:^ascii:]a]/utf,ucp,bincode
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
|
@ -3624,4 +3624,66 @@ No match
|
||||||
\x{b4}
|
\x{b4}
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/[[:^ascii:]]/utf,ucp
|
||||||
|
\x{100}
|
||||||
|
0: \x{100}
|
||||||
|
\x{200}
|
||||||
|
0: \x{200}
|
||||||
|
\x{300}
|
||||||
|
0: \x{300}
|
||||||
|
\x{37e}
|
||||||
|
0: \x{37e}
|
||||||
|
\= Expect no match
|
||||||
|
aa
|
||||||
|
No match
|
||||||
|
99
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[[:^ascii:]\w]/utf,ucp
|
||||||
|
aa
|
||||||
|
0: a
|
||||||
|
99
|
||||||
|
0: 9
|
||||||
|
gg
|
||||||
|
0: g
|
||||||
|
\x{100}
|
||||||
|
0: \x{100}
|
||||||
|
\x{200}
|
||||||
|
0: \x{200}
|
||||||
|
\x{300}
|
||||||
|
0: \x{300}
|
||||||
|
\x{37e}
|
||||||
|
0: \x{37e}
|
||||||
|
|
||||||
|
/[\w[:^ascii:]]/utf,ucp
|
||||||
|
aa
|
||||||
|
0: a
|
||||||
|
99
|
||||||
|
0: 9
|
||||||
|
gg
|
||||||
|
0: g
|
||||||
|
\x{100}
|
||||||
|
0: \x{100}
|
||||||
|
\x{200}
|
||||||
|
0: \x{200}
|
||||||
|
\x{300}
|
||||||
|
0: \x{300}
|
||||||
|
\x{37e}
|
||||||
|
0: \x{37e}
|
||||||
|
|
||||||
|
/[^[:ascii:]\W]/utf,ucp
|
||||||
|
\x{100}
|
||||||
|
0: \x{100}
|
||||||
|
\x{200}
|
||||||
|
0: \x{200}
|
||||||
|
\= Expect no match
|
||||||
|
aa
|
||||||
|
No match
|
||||||
|
99
|
||||||
|
No match
|
||||||
|
gg
|
||||||
|
No match
|
||||||
|
\x{37e}
|
||||||
|
No match
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
|
@ -4097,4 +4097,53 @@ Subject length lower bound = 0
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[[:^ascii:]]/utf,ucp,bincode
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[\x80-\xff] (neg)
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[[:^ascii:]\w]/utf,ucp,bincode
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[\x80-\xff\p{Xwd}\x{100}-\x{10ffff}]
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[\w[:^ascii:]]/utf,ucp,bincode
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[\x80-\xff\p{Xwd}\x{100}-\x{10ffff}]
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[^[:ascii:]\W]/utf,ucp,bincode
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[^\x00-\x7f\P{Xwd}]
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
\x{de}
|
||||||
|
0: \x{de}
|
||||||
|
\x{200}
|
||||||
|
0: \x{200}
|
||||||
|
\= Expect no match
|
||||||
|
\x{300}
|
||||||
|
No match
|
||||||
|
\x{37e}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[[:^ascii:]a]/utf,ucp,bincode
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[a\x80-\xff] (neg)
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
Loading…
Reference in New Issue