Adjust tests for PCRE/Perl anomalies in character properties & fix one bug.

This commit is contained in:
Philip.Hazel 2014-08-06 17:33:14 +00:00
parent fd555f266c
commit f40fba5dc8
9 changed files with 222 additions and 107 deletions

View File

@ -470,6 +470,9 @@ general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */
/* Zero-terminated code-unit spelling of the property escape \p{Cc}. The
posix_substitutes table lists it as the entry for the cntrl class. */
static const PCRE2_UCHAR string_pCc[] = {
  CHAR_BACKSLASH,              /* escape introducer */
  CHAR_p,                      /* lower case p: positive property test */
  CHAR_LEFT_CURLY_BRACKET,
  CHAR_C,                      /* property name "Cc" */
  CHAR_c,
  CHAR_RIGHT_CURLY_BRACKET,
  '\0'                         /* terminator */
};
/* Zero-terminated code-unit spelling of the property escape \p{L}. */
static const PCRE2_UCHAR string_pL[] = {
  CHAR_BACKSLASH,              /* escape introducer */
  CHAR_p,                      /* lower case p: positive property test */
  CHAR_LEFT_CURLY_BRACKET,
  CHAR_L,                      /* property name "L" */
  CHAR_RIGHT_CURLY_BRACKET,
  '\0'                         /* terminator */
};
@ -487,6 +490,9 @@ static const PCRE2_UCHAR string_h[] = {
/* Zero-terminated code-unit spelling of the property escape \p{Xps}. */
static const PCRE2_UCHAR string_pXps[] = {
  CHAR_BACKSLASH,              /* escape introducer */
  CHAR_p,                      /* lower case p: positive property test */
  CHAR_LEFT_CURLY_BRACKET,
  CHAR_X,                      /* property name "Xps" */
  CHAR_p,
  CHAR_s,
  CHAR_RIGHT_CURLY_BRACKET,
  '\0'                         /* terminator */
};
/* Zero-terminated code-unit spelling of the negated property escape \P{Cc}.
The posix_substitutes table lists it as the entry for the ^cntrl class. */
static const PCRE2_UCHAR string_PCc[] = {
  CHAR_BACKSLASH,              /* escape introducer */
  CHAR_P,                      /* upper case P: negated property test */
  CHAR_LEFT_CURLY_BRACKET,
  CHAR_C,                      /* property name "Cc" */
  CHAR_c,
  CHAR_RIGHT_CURLY_BRACKET,
  '\0'                         /* terminator */
};
/* Zero-terminated code-unit spelling of the negated property escape \P{L}. */
static const PCRE2_UCHAR string_PL[] = {
  CHAR_BACKSLASH,              /* escape introducer */
  CHAR_P,                      /* upper case P: negated property test */
  CHAR_LEFT_CURLY_BRACKET,
  CHAR_L,                      /* property name "L" */
  CHAR_RIGHT_CURLY_BRACKET,
  '\0'                         /* terminator */
};
@ -512,7 +518,7 @@ static PCRE2_SPTR posix_substitutes[] = {
string_pXan, /* alnum */
NULL, /* ascii */
string_h, /* blank */
NULL, /* cntrl */
string_pCc, /* cntrl */
string_pNd, /* digit */
NULL, /* graph */
NULL, /* print */
@ -527,7 +533,7 @@ static PCRE2_SPTR posix_substitutes[] = {
string_PXan, /* ^alnum */
NULL, /* ^ascii */
string_H, /* ^blank */
NULL, /* ^cntrl */
string_PCc, /* ^cntrl */
string_PNd, /* ^digit */
NULL, /* ^graph */
NULL, /* ^print */

View File

@ -389,6 +389,11 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */
#ifndef EBCDIC
/* Character U+180E (Mongolian Vowel Separator) is not included in the list of
spaces in the Unicode file PropList.txt, and Perl does not recognize it as a
space. However, in many other sources it is listed as a space, and PCRE has
treated it as one for a long time. */
#define HSPACE_LIST \
CHAR_HT, CHAR_SPACE, 0xa0, \
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \

28
testdata/testinput4 vendored
View File

@ -1139,7 +1139,6 @@
\x{06e9}
\x{060b}
** Failers
\x{061c}
X\x{06e9}
/^[\P{Yi}]/utf
@ -1492,7 +1491,7 @@
>\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b}
/^>[[:blank:]]*/utf,ucp
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
>\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
/^[[:alpha:]]*/utf,ucp
Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
@ -2045,11 +2044,11 @@
/^A\s+Z/utf,ucp
A\x{2005}Z
A\x{85}\x{180e}\x{2005}Z
A\x{85}\x{2005}Z
/^A[\s]+Z/utf,ucp
A\x{2005}Z
A\x{85}\x{180e}\x{2005}Z
A\x{85}\x{2005}Z
/^[[:graph:]]+$/utf,ucp
Letter:ABC
@ -2075,17 +2074,11 @@
\x{20}
\x{85}
\x{a0}
\x{61c}
\x{1680}
\x{180e}
\x{2028}
\x{2029}
\x{202f}
\x{2065}
\x{2066}
\x{2067}
\x{2068}
\x{2069}
\x{3000}
\x{e0002}
\x{e001f}
@ -2103,7 +2096,6 @@
Punctuation:\x{66a},;
Symbol:\x{6de}<>\x{fffc}
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
\x{180e}
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
\x{202f}
@ -2119,14 +2111,9 @@
\x{09}
\x{1D}
\x{85}
\x{61c}
\x{2028}
\x{2029}
\x{2065}
\x{2066}
\x{2067}
\x{2068}
\x{2069}
\x{e0002}
\x{e001f}
\x{e0080}
@ -2140,8 +2127,8 @@
abcde
/^[[:^graph:]]+$/utf,ucp
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680}
\x{2028}\x{2029}\x{202f}\x{2065}
\x{3000}\x{e0002}\x{e001f}\x{e0080}
** Failers
Letter:ABC
@ -2162,8 +2149,8 @@
\x{e0020}\x{e0030}\x{e0040}\x{e0050}\x{e0060}\x{e0070}\x{e007f}
/^[[:^print:]]+$/utf,ucp
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
\x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
\x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065}
\x{e0002}\x{e001f}\x{e0080}
** Failers
Space: \x{a0}
\x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005}
@ -2176,7 +2163,6 @@
Punctuation:\x{66a},;
Symbol:\x{6de}<>\x{fffc}
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
\x{180e}
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
\x{202f}

74
testdata/testinput5 vendored
View File

@ -2,7 +2,72 @@
# support, including Unicode properties. However, tests that give different
# results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
# 12).
# PCRE2 and Perl disagree about the characteristics of certain Unicode
# characters. For example, 061C is considered by Perl to be Arabic, though
# is it not listed as such in the Unicode Scripts.txt file, and 2066-2069 are
# graphic and printable according to Perl, though they are actually "isolate"
# control characters. That is why the following tests are here rather than in
# test 4.
/^[\p{Arabic}]/utf
** Failers
\x{061c}
/^[[:graph:]]+$/utf,ucp
** Failers
\x{61c}
\x{2066}
\x{2067}
\x{2068}
\x{2069}
/^[[:print:]]+$/utf,ucp
** Failers
\x{61c}
\x{2066}
\x{2067}
\x{2068}
\x{2069}
/^[[:^graph:]]+$/utf,ucp
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
/^[[:^print:]]+$/utf,ucp
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
\x{2068}\x{2069}
# Perl does not consider U+180e to be a space character. It is true that it
# does not appear in the Unicode PropList.txt file as such, but in many other
# sources it is listed as a space, and has been treated as such in PCRE for
# a long time.
/^>[[:blank:]]*/utf,ucp
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
/^A\s+Z/utf,ucp
A\x{85}\x{180e}\x{2005}Z
/^A[\s]+Z/utf,ucp
A\x{2005}Z
A\x{85}\x{2005}Z
/^[[:graph:]]+$/utf,ucp
\x{180e}
/^[[:print:]]+$/utf,ucp
\x{180e}
/^[[:^graph:]]+$/utf,ucp
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
/^[[:^print:]]+$/utf,ucp
\x{180e}
# End of U+180E tests.
# ---------------------------------------------------------------------
/\x{110000}/IB,utf
@ -872,9 +937,8 @@
\x{2028}
\x{200d}
# These are here rather than in test 6 because Perl has problems with
# the negative versions of the properties and behaves has changed how
# it behaves for caseless matching.
# These are here because Perl has problems with the negative versions of the
# properties and has changed how it behaves for caseless matching.
/\p{^Lu}/i,utf
1234
@ -1264,8 +1328,6 @@
/(\x{2c65}\x{2c65})\1Y/i,utf
X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
#
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
/^[\p{Batak}]/utf
@ -1287,8 +1349,6 @@
\x{85c}
\x{85d}
#
/(\X*)(.)/s,utf
A\x{300}

58
testdata/testoutput4 vendored
View File

@ -1983,8 +1983,6 @@ No match
\x{060b}
0: \x{60b}
** Failers
No match
\x{061c}
No match
X\x{06e9}
No match
@ -2578,8 +2576,8 @@ No match
0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b}
/^>[[:blank:]]*/utf,ucp
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09}
>\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
0: > \x{a0}\x{1680}\x{2000}\x{202f}\x{09}
/^[[:alpha:]]*/utf,ucp
Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
@ -2591,7 +2589,7 @@ No match
/^[[:cntrl:]]*/utf,ucp
\x{0}\x{09}\x{1f}\x{7f}\x{9f}
0: \x{00}\x{09}\x{1f}\x{7f}
0: \x{00}\x{09}\x{1f}\x{7f}\x{9f}
/^[[:graph:]]*/utf,ucp
A\x{a1}\x{a0}
@ -3414,14 +3412,14 @@ No match
/^A\s+Z/utf,ucp
A\x{2005}Z
0: A\x{2005}Z
A\x{85}\x{180e}\x{2005}Z
0: A\x{85}\x{180e}\x{2005}Z
A\x{85}\x{2005}Z
0: A\x{85}\x{2005}Z
/^A[\s]+Z/utf,ucp
A\x{2005}Z
0: A\x{2005}Z
A\x{85}\x{180e}\x{2005}Z
0: A\x{85}\x{180e}\x{2005}Z
A\x{85}\x{2005}Z
0: A\x{85}\x{2005}Z
/^[[:graph:]]+$/utf,ucp
Letter:ABC
@ -3469,12 +3467,8 @@ No match
\x{85}
No match
\x{a0}
No match
\x{61c}
No match
\x{1680}
No match
\x{180e}
No match
\x{2028}
No match
@ -3483,14 +3477,6 @@ No match
\x{202f}
No match
\x{2065}
No match
\x{2066}
No match
\x{2067}
No match
\x{2068}
No match
\x{2069}
No match
\x{3000}
No match
@ -3524,8 +3510,6 @@ No match
0: Symbol:\x{6de}<>\x{fffc}
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
0: Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
\x{180e}
0: \x{180e}
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
0: \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
\x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
@ -3555,22 +3539,12 @@ No match
\x{1D}
No match
\x{85}
No match
\x{61c}
No match
\x{2028}
No match
\x{2029}
No match
\x{2065}
No match
\x{2066}
No match
\x{2067}
No match
\x{2068}
No match
\x{2069}
No match
\x{e0002}
No match
@ -3594,10 +3568,10 @@ No match
No match
/^[[:^graph:]]+$/utf,ucp
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e}
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680}
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{1680}
\x{2028}\x{2029}\x{202f}\x{2065}
0: \x{2028}\x{2029}\x{202f}\x{2065}
\x{3000}\x{e0002}\x{e001f}\x{e0080}
0: \x{3000}\x{e0002}\x{e001f}\x{e0080}
** Failers
@ -3636,10 +3610,10 @@ No match
No match
/^[[:^print:]]+$/utf,ucp
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
\x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
0: \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
\x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065}
0: \x{09}\x{1d}\x{85}\x{2028}\x{2029}\x{2065}
\x{e0002}\x{e001f}\x{e0080}
0: \x{e0002}\x{e001f}\x{e0080}
** Failers
No match
Space: \x{a0}
@ -3663,8 +3637,6 @@ No match
Symbol:\x{6de}<>\x{fffc}
No match
Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
No match
\x{180e}
No match
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
No match

106
testdata/testoutput5 vendored
View File

@ -2,7 +2,98 @@
# support, including Unicode properties. However, tests that give different
# results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
# 12).
# PCRE2 and Perl disagree about the characteristics of certain Unicode
# characters. For example, 061C is considered by Perl to be Arabic, though
# is it not listed as such in the Unicode Scripts.txt file, and 2066-2069 are
# graphic and printable according to Perl, though they are actually "isolate"
# control characters. That is why the following tests are here rather than in
# test 4.
/^[\p{Arabic}]/utf
** Failers
No match
\x{061c}
No match
/^[[:graph:]]+$/utf,ucp
** Failers
No match
\x{61c}
No match
\x{2066}
No match
\x{2067}
No match
\x{2068}
No match
\x{2069}
No match
/^[[:print:]]+$/utf,ucp
** Failers
0: ** Failers
\x{61c}
No match
\x{2066}
No match
\x{2067}
No match
\x{2068}
No match
\x{2069}
No match
/^[[:^graph:]]+$/utf,ucp
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}
\x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
/^[[:^print:]]+$/utf,ucp
\x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
\x{2068}\x{2069}
0: \x{2068}\x{2069}
# Perl does not consider U+180e to be a space character. It is true that it
# does not appear in the Unicode PropList.txt file as such, but in many other
# sources it is listed as a space, and has been treated as such in PCRE for
# a long time.
/^>[[:blank:]]*/utf,ucp
>\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09}
/^A\s+Z/utf,ucp
A\x{85}\x{180e}\x{2005}Z
0: A\x{85}\x{180e}\x{2005}Z
/^A[\s]+Z/utf,ucp
A\x{2005}Z
0: A\x{2005}Z
A\x{85}\x{2005}Z
0: A\x{85}\x{2005}Z
/^[[:graph:]]+$/utf,ucp
\x{180e}
No match
/^[[:print:]]+$/utf,ucp
\x{180e}
0: \x{180e}
/^[[:^graph:]]+$/utf,ucp
\x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e}
/^[[:^print:]]+$/utf,ucp
\x{180e}
No match
# End of U+180E tests.
# ---------------------------------------------------------------------
/\x{110000}/IB,utf
Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large
@ -2015,9 +2106,8 @@ No match
\x{200d}
No match
# These are here rather than in test 6 because Perl has problems with
# the negative versions of the properties and behaves has changed how
# it behaves for caseless matching.
# These are here because Perl has problems with the negative versions of the
# properties and has changed how it behaves for caseless matching.
/\p{^Lu}/i,utf
1234
@ -2520,7 +2610,7 @@ No match
/[[:cntrl:]]/B,ucp
------------------------------------------------------------------
Bra
[\x00-\x1f\x7f]
[\p{Cc}]
Ket
End
------------------------------------------------------------------
@ -2626,7 +2716,7 @@ No match
/[[:^alpha:][:^cntrl:]]+/B,utf,ucp
------------------------------------------------------------------
Bra
[ -~\x80-\xff\P{L}]++
[\P{L}\P{Cc}]++
Ket
End
------------------------------------------------------------------
@ -2638,7 +2728,7 @@ No match
/[[:^cntrl:][:^alpha:]]+/B,utf,ucp
------------------------------------------------------------------
Bra
[ -~\x80-\xff\P{L}]++
[\P{Cc}\P{L}]++
Ket
End
------------------------------------------------------------------
@ -2850,8 +2940,6 @@ No match
0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
1: \x{2c65}\x{2c65}
#
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
/^[\p{Batak}]/utf
@ -2886,8 +2974,6 @@ No match
\x{85d}
No match
#
/(\X*)(.)/s,utf
A\x{300}
0: A

View File

@ -659,18 +659,18 @@ Memory allocation (code space): 14
/[[:^alpha:][:^cntrl:]]+/utf,ucp
------------------------------------------------------------------
0 26 Bra
2 [ -~\x80-\xff\P{L}]++
26 26 Ket
28 End
0 13 Bra
2 [\P{L}\P{Cc}]++
13 13 Ket
15 End
------------------------------------------------------------------
/[[:^cntrl:][:^alpha:]]+/utf,ucp
------------------------------------------------------------------
0 26 Bra
2 [ -~\x80-\xff\P{L}]++
26 26 Ket
28 End
0 13 Bra
2 [\P{Cc}\P{L}]++
13 13 Ket
15 End
------------------------------------------------------------------
/[[:alpha:]]+/utf,ucp

View File

@ -659,18 +659,18 @@ Memory allocation (code space): 28
/[[:^alpha:][:^cntrl:]]+/utf,ucp
------------------------------------------------------------------
0 18 Bra
2 [ -~\x80-\xff\P{L}]++
18 18 Ket
20 End
0 13 Bra
2 [\P{L}\P{Cc}]++
13 13 Ket
15 End
------------------------------------------------------------------
/[[:^cntrl:][:^alpha:]]+/utf,ucp
------------------------------------------------------------------
0 18 Bra
2 [ -~\x80-\xff\P{L}]++
18 18 Ket
20 End
0 13 Bra
2 [\P{Cc}\P{L}]++
13 13 Ket
15 End
------------------------------------------------------------------
/[[:alpha:]]+/utf,ucp

View File

@ -659,18 +659,18 @@ Memory allocation (code space): 10
/[[:^alpha:][:^cntrl:]]+/utf,ucp
------------------------------------------------------------------
0 44 Bra
3 [ -~\x80-\xff\P{L}]++
44 44 Ket
47 End
0 15 Bra
3 [\P{L}\P{Cc}]++
15 15 Ket
18 End
------------------------------------------------------------------
/[[:^cntrl:][:^alpha:]]+/utf,ucp
------------------------------------------------------------------
0 44 Bra
3 [ -~\x80-\xff\P{L}]++
44 44 Ket
47 End
0 15 Bra
3 [\P{Cc}\P{L}]++
15 15 Ket
18 End
------------------------------------------------------------------
/[[:alpha:]]+/utf,ucp