Fix bug that caused chars > 255 not to be matched by classes like [\W\pL] when
PCRE2_UCP was not set.
This commit is contained in:
parent
69c9d81e43
commit
dda1e79060
|
@ -8,6 +8,12 @@ Version 10.23 xx-xxxxxx-2016
|
|||
1. Extended pcre2test with the utf8_input modifier so that it is able to
|
||||
generate all possible 16-bit and 32-bit code unit values in non-UTF modes.
|
||||
|
||||
2. In any wide-character mode (8-bit UTF or any 16-bit or 32-bit mode), without
|
||||
PCRE2_UCP set, a negative character type such as \D in a positive class should
|
||||
cause all characters greater than 255 to match, whatever else is in the class.
|
||||
There was a bug that caused this not to happen if a Unicode property item was
|
||||
added to such a class, for example [\D\P{Nd}] or [\W\pL].
|
||||
|
||||
|
||||
Version 10.22 29-July-2016
|
||||
--------------------------
|
||||
|
|
|
@ -4950,11 +4950,11 @@ for (;; ptr++)
|
|||
}
|
||||
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
/* If any wide characters have been encountered, set xclass = TRUE. Then,
|
||||
in the pre-compile phase, accumulate the length of the wide characters
|
||||
and reset the pointer. This is so that very large classes that contain a
|
||||
zillion wide characters do not overwrite the work space (which is on the
|
||||
stack). */
|
||||
/* If any wide characters or Unicode properties have been encountered,
|
||||
set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
|
||||
of the wide characters etc. and reset the pointer. This is so that very
|
||||
large classes that contain a zillion wide characters do not overwrite the
|
||||
work space (which is on the stack). */
|
||||
|
||||
if (class_uchardata > class_uchardata_base)
|
||||
{
|
||||
|
@ -4994,22 +4994,43 @@ for (;; ptr++)
|
|||
negated). This requirement is indicated by match_all_or_no_wide_chars being
|
||||
true. We do this by including an explicit range, which works in both cases.
|
||||
|
||||
When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
|
||||
class where \S etc. is present without PCRE2_UCP, causing an extended class
|
||||
to be compiled, we make sure that all characters > 255 are included by
|
||||
forcing match_all_or_no_wide_chars to be true.
|
||||
|
||||
If, when generating an xclass, there are no characters < 256, we can omit
|
||||
the bitmap in the actual compiled code. */
|
||||
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
|
||||
if (xclass && (
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (xclass && (xclass_has_prop || !should_flip_negation ||
|
||||
(options & PCRE2_UCP) != 0))
|
||||
#elif PCRE2_CODE_UNIT_WIDTH != 8
|
||||
if (xclass && (xclass_has_prop || !should_flip_negation))
|
||||
(options & PCRE2_UCP) != 0 ||
|
||||
#endif
|
||||
xclass_has_prop || !should_flip_negation))
|
||||
{
|
||||
if (match_all_or_no_wide_chars)
|
||||
if (match_all_or_no_wide_chars || (
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
utf &&
|
||||
#endif
|
||||
should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
|
||||
{
|
||||
*class_uchardata++ = XCL_RANGE;
|
||||
class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
|
||||
class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
|
||||
if (utf) /* Will always be utf in the 8-bit library */
|
||||
{
|
||||
class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
|
||||
class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
|
||||
}
|
||||
else /* Can only happen for the 16-bit & 32-bit libraries */
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
*class_uchardata++ = 0x100;
|
||||
*class_uchardata++ = 0xffffu;
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
*class_uchardata++ = 0x100;
|
||||
*class_uchardata++ = 0xffffffffu;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
|
||||
*code++ = OP_XCLASS;
|
||||
|
@ -5037,7 +5058,7 @@ for (;; ptr++)
|
|||
PUT(previous, 1, (int)(code - previous));
|
||||
break; /* End of class handling */
|
||||
}
|
||||
#endif
|
||||
#endif /* SUPPORT_WIDE_CHARS */
|
||||
|
||||
/* If there are no characters > 255, or they are all to be included or
|
||||
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
|
||||
|
|
|
@ -445,4 +445,13 @@
|
|||
/(?<=(a)(?-1))x/I,utf
|
||||
a\x80zx\=offset=3
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
abc
|
||||
123
|
||||
|
||||
/[\W\pL]/B
|
||||
abc
|
||||
\= Expect no match
|
||||
123
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -347,4 +347,15 @@
|
|||
|
||||
/abý¿¿¿¿¿z/utf
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
abc
|
||||
123
|
||||
|
||||
/[\W\pL]/B
|
||||
abc
|
||||
\x{100}
|
||||
\x{308}
|
||||
\= Expect no match
|
||||
123
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -1675,15 +1675,6 @@
|
|||
/((?<digit>\d)|(?<letter>\p{L}))/g,substitute_extended,replace=<${digit:+digit; :not digit; }${letter:+letter:not a letter}>
|
||||
ab12cde
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
abc
|
||||
123
|
||||
|
||||
/[\W\pL]/B
|
||||
abc
|
||||
\= Expect no match
|
||||
123
|
||||
|
||||
/(*UCP)(*UTF)[[:>:]]X/B
|
||||
|
||||
/abc/utf,replace=xyz
|
||||
|
@ -1716,4 +1707,21 @@
|
|||
|
||||
/(*UTF)C\x09((?<!'(?x)!*H? #\xcc\x9a[^$]/
|
||||
|
||||
/[\D]/utf
|
||||
\x{1d7cf}
|
||||
|
||||
/[\D\P{Nd}]/utf
|
||||
\x{1d7cf}
|
||||
|
||||
/[^\D]/utf
|
||||
a9b
|
||||
\= Expect no match
|
||||
\x{1d7cf}
|
||||
|
||||
/[^\D\P{Nd}]/utf
|
||||
a9b
|
||||
\x{1d7cf}
|
||||
\= Expect no match
|
||||
\x{10000}
|
||||
|
||||
# End of testinput5
|
||||
|
|
|
@ -1539,4 +1539,29 @@ Subject length lower bound = 1
|
|||
a\x80zx\=offset=3
|
||||
Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{Any}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
123
|
||||
0: 1
|
||||
|
||||
/[\W\pL]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{L}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
\= Expect no match
|
||||
123
|
||||
No match
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -1378,4 +1378,33 @@ Subject length lower bound = 2
|
|||
/abý¿¿¿¿¿z/utf
|
||||
** Failed: character value greater than 0x10ffff cannot be converted to UTF
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{Any}\x{100}-\x{ffff}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
123
|
||||
0: 1
|
||||
|
||||
/[\W\pL]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{L}\x{100}-\x{ffff}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
\x{100}
|
||||
0: \x{100}
|
||||
\x{308}
|
||||
0: \x{308}
|
||||
\= Expect no match
|
||||
123
|
||||
No match
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -1372,4 +1372,33 @@ Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defin
|
|||
/abý¿¿¿¿¿z/utf
|
||||
** Failed: character value greater than 0x10ffff cannot be converted to UTF
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{Any}\x{100}-\x{ffffffff}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
123
|
||||
0: 1
|
||||
|
||||
/[\W\pL]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{L}\x{100}-\x{ffffffff}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
\x{100}
|
||||
0: \x{100}
|
||||
\x{308}
|
||||
0: \x{308}
|
||||
\= Expect no match
|
||||
123
|
||||
No match
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -4020,31 +4020,6 @@ MK: a\x{12345}b\x{09}(d)c
|
|||
ab12cde
|
||||
7: <not digit; letter><not digit; letter><digit; not a letter><digit; not a letter><not digit; letter><not digit; letter><not digit; letter>
|
||||
|
||||
/[\W\p{Any}]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{Any}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
123
|
||||
0: 1
|
||||
|
||||
/[\W\pL]/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[\x00-/:-@[-^`{-\xff\p{L}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
abc
|
||||
0: a
|
||||
\= Expect no match
|
||||
123
|
||||
No match
|
||||
|
||||
/(*UCP)(*UTF)[[:>:]]X/B
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -4161,4 +4136,28 @@ No match
|
|||
/(*UTF)C\x09((?<!'(?x)!*H? #\xcc\x9a[^$]/
|
||||
Failed: error 114 at offset 39: missing closing parenthesis
|
||||
|
||||
/[\D]/utf
|
||||
\x{1d7cf}
|
||||
0: \x{1d7cf}
|
||||
|
||||
/[\D\P{Nd}]/utf
|
||||
\x{1d7cf}
|
||||
0: \x{1d7cf}
|
||||
|
||||
/[^\D]/utf
|
||||
a9b
|
||||
0: 9
|
||||
\= Expect no match
|
||||
\x{1d7cf}
|
||||
No match
|
||||
|
||||
/[^\D\P{Nd}]/utf
|
||||
a9b
|
||||
0: 9
|
||||
\x{1d7cf}
|
||||
0: \x{1d7cf}
|
||||
\= Expect no match
|
||||
\x{10000}
|
||||
No match
|
||||
|
||||
# End of testinput5
|
||||
|
|
Loading…
Reference in New Issue