Adjust tests for PCRE/Perl anomalies in character properties & fix one bug.
This commit is contained in:
parent
fd555f266c
commit
f40fba5dc8
|
@ -470,6 +470,9 @@ general substitute of a Unicode property escape (\p or \P). However, for some
|
||||||
POSIX classes (e.g. graph, print, punct) a special property code is compiled
|
POSIX classes (e.g. graph, print, punct) a special property code is compiled
|
||||||
directly. */
|
directly. */
|
||||||
|
|
||||||
|
static const PCRE2_UCHAR string_pCc[] = {
|
||||||
|
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
|
||||||
|
CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
||||||
static const PCRE2_UCHAR string_pL[] = {
|
static const PCRE2_UCHAR string_pL[] = {
|
||||||
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
|
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
|
||||||
CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
||||||
|
@ -487,6 +490,9 @@ static const PCRE2_UCHAR string_h[] = {
|
||||||
static const PCRE2_UCHAR string_pXps[] = {
|
static const PCRE2_UCHAR string_pXps[] = {
|
||||||
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
|
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
|
||||||
CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
||||||
|
static const PCRE2_UCHAR string_PCc[] = {
|
||||||
|
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
|
||||||
|
CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
||||||
static const PCRE2_UCHAR string_PL[] = {
|
static const PCRE2_UCHAR string_PL[] = {
|
||||||
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
|
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
|
||||||
CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
|
||||||
|
@ -512,7 +518,7 @@ static PCRE2_SPTR posix_substitutes[] = {
|
||||||
string_pXan, /* alnum */
|
string_pXan, /* alnum */
|
||||||
NULL, /* ascii */
|
NULL, /* ascii */
|
||||||
string_h, /* blank */
|
string_h, /* blank */
|
||||||
NULL, /* cntrl */
|
string_pCc, /* cntrl */
|
||||||
string_pNd, /* digit */
|
string_pNd, /* digit */
|
||||||
NULL, /* graph */
|
NULL, /* graph */
|
||||||
NULL, /* print */
|
NULL, /* print */
|
||||||
|
@ -527,7 +533,7 @@ static PCRE2_SPTR posix_substitutes[] = {
|
||||||
string_PXan, /* ^alnum */
|
string_PXan, /* ^alnum */
|
||||||
NULL, /* ^ascii */
|
NULL, /* ^ascii */
|
||||||
string_H, /* ^blank */
|
string_H, /* ^blank */
|
||||||
NULL, /* ^cntrl */
|
string_PCc, /* ^cntrl */
|
||||||
string_PNd, /* ^digit */
|
string_PNd, /* ^digit */
|
||||||
NULL, /* ^graph */
|
NULL, /* ^graph */
|
||||||
NULL, /* ^print */
|
NULL, /* ^print */
|
||||||
|
|
|
@ -389,6 +389,11 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */
|
||||||
|
|
||||||
#ifndef EBCDIC
|
#ifndef EBCDIC
|
||||||
|
|
||||||
|
/* Character U+180E (Mongolian Vowel Separator) is not included in the list of
|
||||||
|
spaces in the Unicode file PropList.txt, and Perl does not recognize it as a
|
||||||
|
space. However, in many other sources it is listed as a space and has been in
|
||||||
|
PCRE for a long time. */
|
||||||
|
|
||||||
#define HSPACE_LIST \
|
#define HSPACE_LIST \
|
||||||
CHAR_HT, CHAR_SPACE, 0xa0, \
|
CHAR_HT, CHAR_SPACE, 0xa0, \
|
||||||
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
|
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
|
||||||
|
|
|
@ -1139,7 +1139,6 @@
|
||||||
\x{06e9}
|
\x{06e9}
|
||||||
\x{060b}
|
\x{060b}
|
||||||
** Failers
|
** Failers
|
||||||
\x{061c}
|
|
||||||
X\x{06e9}
|
X\x{06e9}
|
||||||
|
|
||||||
/^[\P{Yi}]/utf
|
/^[\P{Yi}]/utf
|
||||||
|
@ -1492,7 +1491,7 @@
|
||||||
>\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b}
|
>\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b}
|
||||||
|
|
||||||
/^>[[:blank:]]*/utf,ucp
|
/^>[[:blank:]]*/utf,ucp
|
||||||
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
|
>\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
|
||||||
|
|
||||||
/^[[:alpha:]]*/utf,ucp
|
/^[[:alpha:]]*/utf,ucp
|
||||||
Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
|
Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
|
||||||
|
@ -2045,11 +2044,11 @@
|
||||||
|
|
||||||
/^A\s+Z/utf,ucp
|
/^A\s+Z/utf,ucp
|
||||||
A\x{2005}Z
|
A\x{2005}Z
|
||||||
A\x{85}\x{180e}\x{2005}Z
|
A\x{85}\x{2005}Z
|
||||||
|
|
||||||
/^A[\s]+Z/utf,ucp
|
/^A[\s]+Z/utf,ucp
|
||||||
A\x{2005}Z
|
A\x{2005}Z
|
||||||
A\x{85}\x{180e}\x{2005}Z
|
A\x{85}\x{2005}Z
|
||||||
|
|
||||||
/^[[:graph:]]+$/utf,ucp
|
/^[[:graph:]]+$/utf,ucp
|
||||||
Letter:ABC
|
Letter:ABC
|
||||||
|
@ -2075,17 +2074,11 @@
|
||||||
\x{20}
|
\x{20}
|
||||||
\x{85}
|
\x{85}
|
||||||
\x{a0}
|
\x{a0}
|
||||||
\x{61c}
|
|
||||||
\x{1680}
|
\x{1680}
|
||||||
\x{180e}
|
|
||||||
\x{2028}
|
\x{2028}
|
||||||
\x{2029}
|
\x{2029}
|
||||||
\x{202f}
|
\x{202f}
|
||||||
\x{2065}
|
\x{2065}
|
||||||
\x{2066}
|
|
||||||
\x{2067}
|
|
||||||
\x{2068}
|
|
||||||
\x{2069}
|
|
||||||
\x{3000}
|
\x{3000}
|
||||||
\x{e0002}
|
\x{e0002}
|
||||||
\x{e001f}
|
\x{e001f}
|
||||||
|
@ -2103,7 +2096,6 @@
|
||||||
Punctuation:\x{66a},;
|
Punctuation:\x{66a},;
|
||||||
Symbol:\x{6de}<>\x{fffc}
|
Symbol:\x{6de}<>\x{fffc}
|
||||||
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
||||||
\x{180e}
|
|
||||||
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
||||||
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
|
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
|
||||||
\x{202f}
|
\x{202f}
|
||||||
|
@ -2119,14 +2111,9 @@
|
||||||
\x{09}
|
\x{09}
|
||||||
\x{1D}
|
\x{1D}
|
||||||
\x{85}
|
\x{85}
|
||||||
\x{61c}
|
|
||||||
\x{2028}
|
\x{2028}
|
||||||
\x{2029}
|
\x{2029}
|
||||||
\x{2065}
|
\x{2065}
|
||||||
\x{2066}
|
|
||||||
\x{2067}
|
|
||||||
\x{2068}
|
|
||||||
\x{2069}
|
|
||||||
\x{e0002}
|
\x{e0002}
|
||||||
\x{e001f}
|
\x{e001f}
|
||||||
\x{e0080}
|
\x{e0080}
|
||||||
|
@ -2140,8 +2127,8 @@
|
||||||
abcde
|
abcde
|
||||||
|
|
||||||
/^[[:^graph:]]+$/utf,ucp
|
/^[[:^graph:]]+$/utf,ucp
|
||||||
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
|
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680}
|
||||||
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
|
\x{2028}\x{2029}\x{202f}\x{2065}
|
||||||
\x{3000}\x{e0002}\x{e001f}\x{e0080}
|
\x{3000}\x{e0002}\x{e001f}\x{e0080}
|
||||||
** Failers
|
** Failers
|
||||||
Letter:ABC
|
Letter:ABC
|
||||||
|
@ -2162,8 +2149,8 @@
|
||||||
\x{e0020}\x{e0030}\x{e0040}\x{e0050}\x{e0060}\x{e0070}\x{e007f}
|
\x{e0020}\x{e0030}\x{e0040}\x{e0050}\x{e0060}\x{e0070}\x{e007f}
|
||||||
|
|
||||||
/^[[:^print:]]+$/utf,ucp
|
/^[[:^print:]]+$/utf,ucp
|
||||||
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
|
\x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065}
|
||||||
\x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
|
\x{e0002}\x{e001f}\x{e0080}
|
||||||
** Failers
|
** Failers
|
||||||
Space: \x{a0}
|
Space: \x{a0}
|
||||||
\x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005}
|
\x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005}
|
||||||
|
@ -2176,7 +2163,6 @@
|
||||||
Punctuation:\x{66a},;
|
Punctuation:\x{66a},;
|
||||||
Symbol:\x{6de}<>\x{fffc}
|
Symbol:\x{6de}<>\x{fffc}
|
||||||
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
||||||
\x{180e}
|
|
||||||
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
||||||
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
|
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
|
||||||
\x{202f}
|
\x{202f}
|
||||||
|
|
|
@ -3,6 +3,71 @@
|
||||||
# results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
|
# results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
|
||||||
# 12).
|
# 12).
|
||||||
|
|
||||||
|
# PCRE2 and Perl disagree about the characteristics of certain Unicode
|
||||||
|
# characters. For example, 061C is considered by Perl to be Arabic, though
|
||||||
|
# it is not listed as such in the Unicode Scripts.txt file, and 2066-2069 are
|
||||||
|
# graphic and printable according to Perl, though they are actually "isolate"
|
||||||
|
# control characters. That is why the following tests are here rather than in
|
||||||
|
# test 4.
|
||||||
|
|
||||||
|
/^[\p{Arabic}]/utf
|
||||||
|
** Failers
|
||||||
|
\x{061c}
|
||||||
|
|
||||||
|
/^[[:graph:]]+$/utf,ucp
|
||||||
|
** Failers
|
||||||
|
\x{61c}
|
||||||
|
\x{2066}
|
||||||
|
\x{2067}
|
||||||
|
\x{2068}
|
||||||
|
\x{2069}
|
||||||
|
|
||||||
|
/^[[:print:]]+$/utf,ucp
|
||||||
|
** Failers
|
||||||
|
\x{61c}
|
||||||
|
\x{2066}
|
||||||
|
\x{2067}
|
||||||
|
\x{2068}
|
||||||
|
\x{2069}
|
||||||
|
|
||||||
|
/^[[:^graph:]]+$/utf,ucp
|
||||||
|
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
|
||||||
|
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
|
||||||
|
|
||||||
|
/^[[:^print:]]+$/utf,ucp
|
||||||
|
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
|
||||||
|
\x{2068}\x{2069}
|
||||||
|
|
||||||
|
# Perl does not consider U+180e to be a space character. It is true that it
|
||||||
|
# does not appear in the Unicode PropList.txt file as such, but in many other
|
||||||
|
# sources it is listed as a space, and has been treated as such in PCRE for
|
||||||
|
# a long time.
|
||||||
|
|
||||||
|
/^>[[:blank:]]*/utf,ucp
|
||||||
|
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
|
||||||
|
|
||||||
|
/^A\s+Z/utf,ucp
|
||||||
|
A\x{85}\x{180e}\x{2005}Z
|
||||||
|
|
||||||
|
/^A[\s]+Z/utf,ucp
|
||||||
|
A\x{2005}Z
|
||||||
|
A\x{85}\x{2005}Z
|
||||||
|
|
||||||
|
/^[[:graph:]]+$/utf,ucp
|
||||||
|
\x{180e}
|
||||||
|
|
||||||
|
/^[[:print:]]+$/utf,ucp
|
||||||
|
\x{180e}
|
||||||
|
|
||||||
|
/^[[:^graph:]]+$/utf,ucp
|
||||||
|
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
|
||||||
|
|
||||||
|
/^[[:^print:]]+$/utf,ucp
|
||||||
|
\x{180e}
|
||||||
|
|
||||||
|
# End of U+180E tests.
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
|
||||||
/\x{110000}/IB,utf
|
/\x{110000}/IB,utf
|
||||||
|
|
||||||
|
@ -872,9 +937,8 @@
|
||||||
\x{2028}
|
\x{2028}
|
||||||
\x{200d}
|
\x{200d}
|
||||||
|
|
||||||
# These are here rather than in test 6 because Perl has problems with
|
# These are here because Perl has problems with the negative versions of the
|
||||||
# the negative versions of the properties and behaves has changed how
|
# properties and has changed how it behaves for caseless matching.
|
||||||
# it behaves for caseless matching.
|
|
||||||
|
|
||||||
/\p{^Lu}/i,utf
|
/\p{^Lu}/i,utf
|
||||||
1234
|
1234
|
||||||
|
@ -1264,8 +1328,6 @@
|
||||||
/(\x{2c65}\x{2c65})\1Y/i,utf
|
/(\x{2c65}\x{2c65})\1Y/i,utf
|
||||||
X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
|
X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
|
||||||
|
|
||||||
#
|
|
||||||
|
|
||||||
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
|
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
|
||||||
|
|
||||||
/^[\p{Batak}]/utf
|
/^[\p{Batak}]/utf
|
||||||
|
@ -1287,8 +1349,6 @@
|
||||||
\x{85c}
|
\x{85c}
|
||||||
\x{85d}
|
\x{85d}
|
||||||
|
|
||||||
#
|
|
||||||
|
|
||||||
/(\X*)(.)/s,utf
|
/(\X*)(.)/s,utf
|
||||||
A\x{300}
|
A\x{300}
|
||||||
|
|
||||||
|
|
|
@ -1983,8 +1983,6 @@ No match
|
||||||
\x{060b}
|
\x{060b}
|
||||||
0: \x{60b}
|
0: \x{60b}
|
||||||
** Failers
|
** Failers
|
||||||
No match
|
|
||||||
\x{061c}
|
|
||||||
No match
|
No match
|
||||||
X\x{06e9}
|
X\x{06e9}
|
||||||
No match
|
No match
|
||||||
|
@ -2578,8 +2576,8 @@ No match
|
||||||
0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b}
|
0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b}
|
||||||
|
|
||||||
/^>[[:blank:]]*/utf,ucp
|
/^>[[:blank:]]*/utf,ucp
|
||||||
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
|
>\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
|
||||||
0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09}
|
0: > \x{a0}\x{1680}\x{2000}\x{202f}\x{09}
|
||||||
|
|
||||||
/^[[:alpha:]]*/utf,ucp
|
/^[[:alpha:]]*/utf,ucp
|
||||||
Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
|
Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
|
||||||
|
@ -2591,7 +2589,7 @@ No match
|
||||||
|
|
||||||
/^[[:cntrl:]]*/utf,ucp
|
/^[[:cntrl:]]*/utf,ucp
|
||||||
\x{0}\x{09}\x{1f}\x{7f}\x{9f}
|
\x{0}\x{09}\x{1f}\x{7f}\x{9f}
|
||||||
0: \x{00}\x{09}\x{1f}\x{7f}
|
0: \x{00}\x{09}\x{1f}\x{7f}\x{9f}
|
||||||
|
|
||||||
/^[[:graph:]]*/utf,ucp
|
/^[[:graph:]]*/utf,ucp
|
||||||
A\x{a1}\x{a0}
|
A\x{a1}\x{a0}
|
||||||
|
@ -3414,14 +3412,14 @@ No match
|
||||||
/^A\s+Z/utf,ucp
|
/^A\s+Z/utf,ucp
|
||||||
A\x{2005}Z
|
A\x{2005}Z
|
||||||
0: A\x{2005}Z
|
0: A\x{2005}Z
|
||||||
A\x{85}\x{180e}\x{2005}Z
|
A\x{85}\x{2005}Z
|
||||||
0: A\x{85}\x{180e}\x{2005}Z
|
0: A\x{85}\x{2005}Z
|
||||||
|
|
||||||
/^A[\s]+Z/utf,ucp
|
/^A[\s]+Z/utf,ucp
|
||||||
A\x{2005}Z
|
A\x{2005}Z
|
||||||
0: A\x{2005}Z
|
0: A\x{2005}Z
|
||||||
A\x{85}\x{180e}\x{2005}Z
|
A\x{85}\x{2005}Z
|
||||||
0: A\x{85}\x{180e}\x{2005}Z
|
0: A\x{85}\x{2005}Z
|
||||||
|
|
||||||
/^[[:graph:]]+$/utf,ucp
|
/^[[:graph:]]+$/utf,ucp
|
||||||
Letter:ABC
|
Letter:ABC
|
||||||
|
@ -3469,12 +3467,8 @@ No match
|
||||||
\x{85}
|
\x{85}
|
||||||
No match
|
No match
|
||||||
\x{a0}
|
\x{a0}
|
||||||
No match
|
|
||||||
\x{61c}
|
|
||||||
No match
|
No match
|
||||||
\x{1680}
|
\x{1680}
|
||||||
No match
|
|
||||||
\x{180e}
|
|
||||||
No match
|
No match
|
||||||
\x{2028}
|
\x{2028}
|
||||||
No match
|
No match
|
||||||
|
@ -3483,14 +3477,6 @@ No match
|
||||||
\x{202f}
|
\x{202f}
|
||||||
No match
|
No match
|
||||||
\x{2065}
|
\x{2065}
|
||||||
No match
|
|
||||||
\x{2066}
|
|
||||||
No match
|
|
||||||
\x{2067}
|
|
||||||
No match
|
|
||||||
\x{2068}
|
|
||||||
No match
|
|
||||||
\x{2069}
|
|
||||||
No match
|
No match
|
||||||
\x{3000}
|
\x{3000}
|
||||||
No match
|
No match
|
||||||
|
@ -3524,8 +3510,6 @@ No match
|
||||||
0: Symbol:\x{6de}<>\x{fffc}
|
0: Symbol:\x{6de}<>\x{fffc}
|
||||||
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
||||||
0: Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
0: Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
||||||
\x{180e}
|
|
||||||
0: \x{180e}
|
|
||||||
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
||||||
0: \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
0: \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
||||||
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
|
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
|
||||||
|
@ -3555,22 +3539,12 @@ No match
|
||||||
\x{1D}
|
\x{1D}
|
||||||
No match
|
No match
|
||||||
\x{85}
|
\x{85}
|
||||||
No match
|
|
||||||
\x{61c}
|
|
||||||
No match
|
No match
|
||||||
\x{2028}
|
\x{2028}
|
||||||
No match
|
No match
|
||||||
\x{2029}
|
\x{2029}
|
||||||
No match
|
No match
|
||||||
\x{2065}
|
\x{2065}
|
||||||
No match
|
|
||||||
\x{2066}
|
|
||||||
No match
|
|
||||||
\x{2067}
|
|
||||||
No match
|
|
||||||
\x{2068}
|
|
||||||
No match
|
|
||||||
\x{2069}
|
|
||||||
No match
|
No match
|
||||||
\x{e0002}
|
\x{e0002}
|
||||||
No match
|
No match
|
||||||
|
@ -3594,10 +3568,10 @@ No match
|
||||||
No match
|
No match
|
||||||
|
|
||||||
/^[[:^graph:]]+$/utf,ucp
|
/^[[:^graph:]]+$/utf,ucp
|
||||||
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
|
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680}
|
||||||
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e}
|
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{1680}
|
||||||
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
|
\x{2028}\x{2029}\x{202f}\x{2065}
|
||||||
0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
|
0: \x{2028}\x{2029}\x{202f}\x{2065}
|
||||||
\x{3000}\x{e0002}\x{e001f}\x{e0080}
|
\x{3000}\x{e0002}\x{e001f}\x{e0080}
|
||||||
0: \x{3000}\x{e0002}\x{e001f}\x{e0080}
|
0: \x{3000}\x{e0002}\x{e001f}\x{e0080}
|
||||||
** Failers
|
** Failers
|
||||||
|
@ -3636,10 +3610,10 @@ No match
|
||||||
No match
|
No match
|
||||||
|
|
||||||
/^[[:^print:]]+$/utf,ucp
|
/^[[:^print:]]+$/utf,ucp
|
||||||
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
|
\x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065}
|
||||||
0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
|
0: \x{09}\x{1d}\x{85}\x{2028}\x{2029}\x{2065}
|
||||||
\x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
|
\x{e0002}\x{e001f}\x{e0080}
|
||||||
0: \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
|
0: \x{e0002}\x{e001f}\x{e0080}
|
||||||
** Failers
|
** Failers
|
||||||
No match
|
No match
|
||||||
Space: \x{a0}
|
Space: \x{a0}
|
||||||
|
@ -3663,8 +3637,6 @@ No match
|
||||||
Symbol:\x{6de}<>\x{fffc}
|
Symbol:\x{6de}<>\x{fffc}
|
||||||
No match
|
No match
|
||||||
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
|
||||||
No match
|
|
||||||
\x{180e}
|
|
||||||
No match
|
No match
|
||||||
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
|
||||||
No match
|
No match
|
||||||
|
|
|
@ -3,6 +3,97 @@
|
||||||
# results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
|
# results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
|
||||||
# 12).
|
# 12).
|
||||||
|
|
||||||
|
# PCRE2 and Perl disagree about the characteristics of certain Unicode
|
||||||
|
# characters. For example, 061C is considered by Perl to be Arabic, though
|
||||||
|
# it is not listed as such in the Unicode Scripts.txt file, and 2066-2069 are
|
||||||
|
# graphic and printable according to Perl, though they are actually "isolate"
|
||||||
|
# control characters. That is why the following tests are here rather than in
|
||||||
|
# test 4.
|
||||||
|
|
||||||
|
/^[\p{Arabic}]/utf
|
||||||
|
** Failers
|
||||||
|
No match
|
||||||
|
\x{061c}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/^[[:graph:]]+$/utf,ucp
|
||||||
|
** Failers
|
||||||
|
No match
|
||||||
|
\x{61c}
|
||||||
|
No match
|
||||||
|
\x{2066}
|
||||||
|
No match
|
||||||
|
\x{2067}
|
||||||
|
No match
|
||||||
|
\x{2068}
|
||||||
|
No match
|
||||||
|
\x{2069}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/^[[:print:]]+$/utf,ucp
|
||||||
|
** Failers
|
||||||
|
0: ** Failers
|
||||||
|
\x{61c}
|
||||||
|
No match
|
||||||
|
\x{2066}
|
||||||
|
No match
|
||||||
|
\x{2067}
|
||||||
|
No match
|
||||||
|
\x{2068}
|
||||||
|
No match
|
||||||
|
\x{2069}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/^[[:^graph:]]+$/utf,ucp
|
||||||
|
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
|
||||||
|
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}
|
||||||
|
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
|
||||||
|
0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
|
||||||
|
|
||||||
|
/^[[:^print:]]+$/utf,ucp
|
||||||
|
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
|
||||||
|
0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
|
||||||
|
\x{2068}\x{2069}
|
||||||
|
0: \x{2068}\x{2069}
|
||||||
|
|
||||||
|
# Perl does not consider U+180e to be a space character. It is true that it
|
||||||
|
# does not appear in the Unicode PropList.txt file as such, but in many other
|
||||||
|
# sources it is listed as a space, and has been treated as such in PCRE for
|
||||||
|
# a long time.
|
||||||
|
|
||||||
|
/^>[[:blank:]]*/utf,ucp
|
||||||
|
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
|
||||||
|
0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09}
|
||||||
|
|
||||||
|
/^A\s+Z/utf,ucp
|
||||||
|
A\x{85}\x{180e}\x{2005}Z
|
||||||
|
0: A\x{85}\x{180e}\x{2005}Z
|
||||||
|
|
||||||
|
/^A[\s]+Z/utf,ucp
|
||||||
|
A\x{2005}Z
|
||||||
|
0: A\x{2005}Z
|
||||||
|
A\x{85}\x{2005}Z
|
||||||
|
0: A\x{85}\x{2005}Z
|
||||||
|
|
||||||
|
/^[[:graph:]]+$/utf,ucp
|
||||||
|
\x{180e}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/^[[:print:]]+$/utf,ucp
|
||||||
|
\x{180e}
|
||||||
|
0: \x{180e}
|
||||||
|
|
||||||
|
/^[[:^graph:]]+$/utf,ucp
|
||||||
|
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
|
||||||
|
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e}
|
||||||
|
|
||||||
|
/^[[:^print:]]+$/utf,ucp
|
||||||
|
\x{180e}
|
||||||
|
No match
|
||||||
|
|
||||||
|
# End of U+180E tests.
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
|
||||||
/\x{110000}/IB,utf
|
/\x{110000}/IB,utf
|
||||||
Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large
|
Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large
|
||||||
|
@ -2015,9 +2106,8 @@ No match
|
||||||
\x{200d}
|
\x{200d}
|
||||||
No match
|
No match
|
||||||
|
|
||||||
# These are here rather than in test 6 because Perl has problems with
|
# These are here because Perl has problems with the negative versions of the
|
||||||
# the negative versions of the properties and behaves has changed how
|
# properties and has changed how it behaves for caseless matching.
|
||||||
# it behaves for caseless matching.
|
|
||||||
|
|
||||||
/\p{^Lu}/i,utf
|
/\p{^Lu}/i,utf
|
||||||
1234
|
1234
|
||||||
|
@ -2520,7 +2610,7 @@ No match
|
||||||
/[[:cntrl:]]/B,ucp
|
/[[:cntrl:]]/B,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
[\x00-\x1f\x7f]
|
[\p{Cc}]
|
||||||
Ket
|
Ket
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
@ -2626,7 +2716,7 @@ No match
|
||||||
/[[:^alpha:][:^cntrl:]]+/B,utf,ucp
|
/[[:^alpha:][:^cntrl:]]+/B,utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
[ -~\x80-\xff\P{L}]++
|
[\P{L}\P{Cc}]++
|
||||||
Ket
|
Ket
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
@ -2638,7 +2728,7 @@ No match
|
||||||
/[[:^cntrl:][:^alpha:]]+/B,utf,ucp
|
/[[:^cntrl:][:^alpha:]]+/B,utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
[ -~\x80-\xff\P{L}]++
|
[\P{Cc}\P{L}]++
|
||||||
Ket
|
Ket
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
@ -2850,8 +2940,6 @@ No match
|
||||||
0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
|
0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
|
||||||
1: \x{2c65}\x{2c65}
|
1: \x{2c65}\x{2c65}
|
||||||
|
|
||||||
#
|
|
||||||
|
|
||||||
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
|
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
|
||||||
|
|
||||||
/^[\p{Batak}]/utf
|
/^[\p{Batak}]/utf
|
||||||
|
@ -2886,8 +2974,6 @@ No match
|
||||||
\x{85d}
|
\x{85d}
|
||||||
No match
|
No match
|
||||||
|
|
||||||
#
|
|
||||||
|
|
||||||
/(\X*)(.)/s,utf
|
/(\X*)(.)/s,utf
|
||||||
A\x{300}
|
A\x{300}
|
||||||
0: A
|
0: A
|
||||||
|
|
|
@ -659,18 +659,18 @@ Memory allocation (code space): 14
|
||||||
|
|
||||||
/[[:^alpha:][:^cntrl:]]+/utf,ucp
|
/[[:^alpha:][:^cntrl:]]+/utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
0 26 Bra
|
0 13 Bra
|
||||||
2 [ -~\x80-\xff\P{L}]++
|
2 [\P{L}\P{Cc}]++
|
||||||
26 26 Ket
|
13 13 Ket
|
||||||
28 End
|
15 End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/[[:^cntrl:][:^alpha:]]+/utf,ucp
|
/[[:^cntrl:][:^alpha:]]+/utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
0 26 Bra
|
0 13 Bra
|
||||||
2 [ -~\x80-\xff\P{L}]++
|
2 [\P{Cc}\P{L}]++
|
||||||
26 26 Ket
|
13 13 Ket
|
||||||
28 End
|
15 End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/[[:alpha:]]+/utf,ucp
|
/[[:alpha:]]+/utf,ucp
|
||||||
|
|
|
@ -659,18 +659,18 @@ Memory allocation (code space): 28
|
||||||
|
|
||||||
/[[:^alpha:][:^cntrl:]]+/utf,ucp
|
/[[:^alpha:][:^cntrl:]]+/utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
0 18 Bra
|
0 13 Bra
|
||||||
2 [ -~\x80-\xff\P{L}]++
|
2 [\P{L}\P{Cc}]++
|
||||||
18 18 Ket
|
13 13 Ket
|
||||||
20 End
|
15 End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/[[:^cntrl:][:^alpha:]]+/utf,ucp
|
/[[:^cntrl:][:^alpha:]]+/utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
0 18 Bra
|
0 13 Bra
|
||||||
2 [ -~\x80-\xff\P{L}]++
|
2 [\P{Cc}\P{L}]++
|
||||||
18 18 Ket
|
13 13 Ket
|
||||||
20 End
|
15 End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/[[:alpha:]]+/utf,ucp
|
/[[:alpha:]]+/utf,ucp
|
||||||
|
|
|
@ -659,18 +659,18 @@ Memory allocation (code space): 10
|
||||||
|
|
||||||
/[[:^alpha:][:^cntrl:]]+/utf,ucp
|
/[[:^alpha:][:^cntrl:]]+/utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
0 44 Bra
|
0 15 Bra
|
||||||
3 [ -~\x80-\xff\P{L}]++
|
3 [\P{L}\P{Cc}]++
|
||||||
44 44 Ket
|
15 15 Ket
|
||||||
47 End
|
18 End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/[[:^cntrl:][:^alpha:]]+/utf,ucp
|
/[[:^cntrl:][:^alpha:]]+/utf,ucp
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
0 44 Bra
|
0 15 Bra
|
||||||
3 [ -~\x80-\xff\P{L}]++
|
3 [\P{Cc}\P{L}]++
|
||||||
44 44 Ket
|
15 15 Ket
|
||||||
47 End
|
18 End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/[[:alpha:]]+/utf,ucp
|
/[[:alpha:]]+/utf,ucp
|
||||||
|
|
Loading…
Reference in New Issue