From f40fba5dc84fbae43b33ad5499b9eea76b743124 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 6 Aug 2014 17:33:14 +0000 Subject: [PATCH] Adjust tests for PCRE/Perl anomalies in character properties & fix one bug. --- src/pcre2_compile.c | 10 +++- src/pcre2_internal.h | 5 ++ testdata/testinput4 | 28 +++-------- testdata/testinput5 | 74 +++++++++++++++++++++++++--- testdata/testoutput4 | 58 ++++++---------------- testdata/testoutput5 | 106 ++++++++++++++++++++++++++++++++++++---- testdata/testoutput8-16 | 16 +++--- testdata/testoutput8-32 | 16 +++--- testdata/testoutput8-8 | 16 +++--- 9 files changed, 222 insertions(+), 107 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 4565e2b..4157a8a 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -470,6 +470,9 @@ general substitute of a Unicode property escape (\p or \P). However, for some POSIX classes (e.g. graph, print, punct) a special property code is compiled directly. */ +static const PCRE2_UCHAR string_pCc[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; static const PCRE2_UCHAR string_pL[] = { CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; @@ -487,6 +490,9 @@ static const PCRE2_UCHAR string_h[] = { static const PCRE2_UCHAR string_pXps[] = { CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const PCRE2_UCHAR string_PCc[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; static const PCRE2_UCHAR string_PL[] = { CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; @@ -512,7 +518,7 @@ static PCRE2_SPTR posix_substitutes[] = { string_pXan, /* alnum */ NULL, /* ascii */ string_h, /* blank */ - NULL, /* cntrl */ + string_pCc, /* cntrl */ string_pNd, /* digit */ NULL, /* graph */ NULL, /* print */ @@ -527,7 +533,7 @@ 
static PCRE2_SPTR posix_substitutes[] = { string_PXan, /* ^alnum */ NULL, /* ^ascii */ string_H, /* ^blank */ - NULL, /* ^cntrl */ + string_PCc, /* ^cntrl */ string_PNd, /* ^digit */ NULL, /* ^graph */ NULL, /* ^print */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 7c2132d..f4261eb 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -389,6 +389,11 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */ #ifndef EBCDIC +/* Character U+180E (Mongolian Vowel Separator) is not included in the list of +spaces in the Unicode file PropList.txt, and Perl does not recognize it as a +space. However, in many other sources it is listed as a space, and PCRE has +treated it as one for a long time. */ + #define HSPACE_LIST \ CHAR_HT, CHAR_SPACE, 0xa0, \ 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \ diff --git a/testdata/testinput4 b/testdata/testinput4 index d4b6075..873d5fa 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -1139,7 +1139,6 @@ \x{06e9} \x{060b} ** Failers - \x{061c} X\x{06e9} /^[\P{Yi}]/utf @@ -1492,7 +1491,7 @@ >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} /^>[[:blank:]]*/utf,ucp - >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} + >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} /^[[:alpha:]]*/utf,ucp Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d} @@ -2045,11 +2044,11 @@ /^A\s+Z/utf,ucp A\x{2005}Z - A\x{85}\x{180e}\x{2005}Z + A\x{85}\x{2005}Z /^A[\s]+Z/utf,ucp A\x{2005}Z - A\x{85}\x{180e}\x{2005}Z + A\x{85}\x{2005}Z /^[[:graph:]]+$/utf,ucp Letter:ABC @@ -2075,17 +2074,11 @@ \x{20} \x{85} \x{a0} - \x{61c} \x{1680} - \x{180e} \x{2028} \x{2029} \x{202f} \x{2065} - \x{2066} - \x{2067} - \x{2068} - \x{2069} \x{3000} \x{e0002} \x{e001f} @@ -2103,7 +2096,6 @@ Punctuation:\x{66a},; Symbol:\x{6de}<>\x{fffc} Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f} - \x{180e} \x{200b}\x{200c}\x{200d}\x{200e}\x{200f} \x{202a}\x{202b}\x{202c}\x{202d}\x{202e} \x{202f}
@@ -2119,14 +2111,9 @@ \x{09} \x{1D} \x{85} - \x{61c} \x{2028} \x{2029} \x{2065} - \x{2066} - \x{2067} - \x{2068} - \x{2069} \x{e0002} \x{e001f} \x{e0080} @@ -2140,8 +2127,8 @@ abcde /^[[:^graph:]]+$/utf,ucp - \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e} - \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069} + \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680} + \x{2028}\x{2029}\x{202f}\x{2065} \x{3000}\x{e0002}\x{e001f}\x{e0080} ** Failers Letter:ABC @@ -2162,8 +2149,8 @@ \x{e0020}\x{e0030}\x{e0040}\x{e0050}\x{e0060}\x{e0070}\x{e007f} /^[[:^print:]]+$/utf,ucp - \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067} - \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080} + \x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065} + \x{e0002}\x{e001f}\x{e0080} ** Failers Space: \x{a0} \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005} @@ -2176,7 +2163,6 @@ Punctuation:\x{66a},; Symbol:\x{6de}<>\x{fffc} Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f} - \x{180e} \x{200b}\x{200c}\x{200d}\x{200e}\x{200f} \x{202a}\x{202b}\x{202c}\x{202d}\x{202e} \x{202f} diff --git a/testdata/testinput5 b/testdata/testinput5 index a2431b1..9149855 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2,7 +2,72 @@ # support, including Unicode properties. However, tests that give different # results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and # 12). + +# PCRE2 and Perl disagree about the characteristics of certain Unicode +# characters. For example, 061C is considered by Perl to be Arabic, though +# is it not listed as such in the Unicode Scripts.txt file, and 2066-2069 are +# graphic and printable according to Perl, though they are actually "isolate" +# control characters. That is why the following tests are here rather than in +# test 4. 
+ +/^[\p{Arabic}]/utf + ** Failers + \x{061c} +/^[[:graph:]]+$/utf,ucp + ** Failers + \x{61c} + \x{2066} + \x{2067} + \x{2068} + \x{2069} + +/^[[:print:]]+$/utf,ucp + ** Failers + \x{61c} + \x{2066} + \x{2067} + \x{2068} + \x{2069} + +/^[[:^graph:]]+$/utf,ucp + \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680} + \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069} + +/^[[:^print:]]+$/utf,ucp + \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067} + \x{2068}\x{2069} + +# Perl does not consider U+180e to be a space character. It is true that it +# does not appear in the Unicode PropList.txt file as such, but in many other +# sources it is listed as a space, and has been treated as such in PCRE for +# a long time. + +/^>[[:blank:]]*/utf,ucp + >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} + +/^A\s+Z/utf,ucp + A\x{85}\x{180e}\x{2005}Z + +/^A[\s]+Z/utf,ucp + A\x{2005}Z + A\x{85}\x{2005}Z + +/^[[:graph:]]+$/utf,ucp + \x{180e} + +/^[[:print:]]+$/utf,ucp + \x{180e} + +/^[[:^graph:]]+$/utf,ucp + \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e} + +/^[[:^print:]]+$/utf,ucp + \x{180e} + +# End of U+180E tests. + +# --------------------------------------------------------------------- /\x{110000}/IB,utf @@ -872,9 +937,8 @@ \x{2028} \x{200d} -# These are here rather than in test 6 because Perl has problems with -# the negative versions of the properties and behaves has changed how -# it behaves for caseless matching. +# These are here because Perl has problems with the negative versions of the +# properties and has changed how it behaves for caseless matching. 
/\p{^Lu}/i,utf 1234 @@ -1264,8 +1328,6 @@ /(\x{2c65}\x{2c65})\1Y/i,utf X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ -# - # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE /^[\p{Batak}]/utf @@ -1287,8 +1349,6 @@ \x{85c} \x{85d} -# - /(\X*)(.)/s,utf A\x{300} diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 97cff37..b3b6896 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1983,8 +1983,6 @@ No match \x{060b} 0: \x{60b} ** Failers -No match - \x{061c} No match X\x{06e9} No match @@ -2578,8 +2576,8 @@ No match 0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b} /^>[[:blank:]]*/utf,ucp - >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} - 0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09} + >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} + 0: > \x{a0}\x{1680}\x{2000}\x{202f}\x{09} /^[[:alpha:]]*/utf,ucp Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d} @@ -2591,7 +2589,7 @@ No match /^[[:cntrl:]]*/utf,ucp \x{0}\x{09}\x{1f}\x{7f}\x{9f} - 0: \x{00}\x{09}\x{1f}\x{7f} + 0: \x{00}\x{09}\x{1f}\x{7f}\x{9f} /^[[:graph:]]*/utf,ucp A\x{a1}\x{a0} @@ -3414,14 +3412,14 @@ No match /^A\s+Z/utf,ucp A\x{2005}Z 0: A\x{2005}Z - A\x{85}\x{180e}\x{2005}Z - 0: A\x{85}\x{180e}\x{2005}Z + A\x{85}\x{2005}Z + 0: A\x{85}\x{2005}Z /^A[\s]+Z/utf,ucp A\x{2005}Z 0: A\x{2005}Z - A\x{85}\x{180e}\x{2005}Z - 0: A\x{85}\x{180e}\x{2005}Z + A\x{85}\x{2005}Z + 0: A\x{85}\x{2005}Z /^[[:graph:]]+$/utf,ucp Letter:ABC @@ -3469,12 +3467,8 @@ No match \x{85} No match \x{a0} -No match - \x{61c} No match \x{1680} -No match - \x{180e} No match \x{2028} No match @@ -3483,14 +3477,6 @@ No match \x{202f} No match \x{2065} -No match - \x{2066} -No match - \x{2067} -No match - \x{2068} -No match - \x{2069} No match \x{3000} No match @@ -3524,8 +3510,6 @@ No match 0: Symbol:\x{6de}<>\x{fffc} Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f} 0: Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f} - \x{180e} - 0: \x{180e} 
\x{200b}\x{200c}\x{200d}\x{200e}\x{200f} 0: \x{200b}\x{200c}\x{200d}\x{200e}\x{200f} \x{202a}\x{202b}\x{202c}\x{202d}\x{202e} @@ -3555,22 +3539,12 @@ No match \x{1D} No match \x{85} -No match - \x{61c} No match \x{2028} No match \x{2029} No match \x{2065} -No match - \x{2066} -No match - \x{2067} -No match - \x{2068} -No match - \x{2069} No match \x{e0002} No match @@ -3594,10 +3568,10 @@ No match No match /^[[:^graph:]]+$/utf,ucp - \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e} - 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e} - \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069} - 0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069} + \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680} + 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{1680} + \x{2028}\x{2029}\x{202f}\x{2065} + 0: \x{2028}\x{2029}\x{202f}\x{2065} \x{3000}\x{e0002}\x{e001f}\x{e0080} 0: \x{3000}\x{e0002}\x{e001f}\x{e0080} ** Failers @@ -3636,10 +3610,10 @@ No match No match /^[[:^print:]]+$/utf,ucp - \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067} - 0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067} - \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080} - 0: \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080} + \x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065} + 0: \x{09}\x{1d}\x{85}\x{2028}\x{2029}\x{2065} + \x{e0002}\x{e001f}\x{e0080} + 0: \x{e0002}\x{e001f}\x{e0080} ** Failers No match Space: \x{a0} @@ -3663,8 +3637,6 @@ No match Symbol:\x{6de}<>\x{fffc} No match Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f} -No match - \x{180e} No match \x{200b}\x{200c}\x{200d}\x{200e}\x{200f} No match diff --git a/testdata/testoutput5 b/testdata/testoutput5 index d1bb20a..2ddd11f 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -2,7 +2,98 @@ # support, including Unicode properties. However, tests that give different # results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and # 12). 
+ +# PCRE2 and Perl disagree about the characteristics of certain Unicode +# characters. For example, 061C is considered by Perl to be Arabic, though +# is it not listed as such in the Unicode Scripts.txt file, and 2066-2069 are +# graphic and printable according to Perl, though they are actually "isolate" +# control characters. That is why the following tests are here rather than in +# test 4. + +/^[\p{Arabic}]/utf + ** Failers +No match + \x{061c} +No match +/^[[:graph:]]+$/utf,ucp + ** Failers +No match + \x{61c} +No match + \x{2066} +No match + \x{2067} +No match + \x{2068} +No match + \x{2069} +No match + +/^[[:print:]]+$/utf,ucp + ** Failers + 0: ** Failers + \x{61c} +No match + \x{2066} +No match + \x{2067} +No match + \x{2068} +No match + \x{2069} +No match + +/^[[:^graph:]]+$/utf,ucp + \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680} + 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680} + \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069} + 0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069} + +/^[[:^print:]]+$/utf,ucp + \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067} + 0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067} + \x{2068}\x{2069} + 0: \x{2068}\x{2069} + +# Perl does not consider U+180e to be a space character. It is true that it +# does not appear in the Unicode PropList.txt file as such, but in many other +# sources it is listed as a space, and has been treated as such in PCRE for +# a long time. 
+ +/^>[[:blank:]]*/utf,ucp + >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} + 0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09} + +/^A\s+Z/utf,ucp + A\x{85}\x{180e}\x{2005}Z + 0: A\x{85}\x{180e}\x{2005}Z + +/^A[\s]+Z/utf,ucp + A\x{2005}Z + 0: A\x{2005}Z + A\x{85}\x{2005}Z + 0: A\x{85}\x{2005}Z + +/^[[:graph:]]+$/utf,ucp + \x{180e} +No match + +/^[[:print:]]+$/utf,ucp + \x{180e} + 0: \x{180e} + +/^[[:^graph:]]+$/utf,ucp + \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e} + 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e} + +/^[[:^print:]]+$/utf,ucp + \x{180e} +No match + +# End of U+180E tests. + +# --------------------------------------------------------------------- /\x{110000}/IB,utf Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large @@ -2015,9 +2106,8 @@ No match \x{200d} No match -# These are here rather than in test 6 because Perl has problems with -# the negative versions of the properties and behaves has changed how -# it behaves for caseless matching. +# These are here because Perl has problems with the negative versions of the +# properties and has changed how it behaves for caseless matching. 
/\p{^Lu}/i,utf 1234 @@ -2520,7 +2610,7 @@ No match /[[:cntrl:]]/B,ucp ------------------------------------------------------------------ Bra - [\x00-\x1f\x7f] + [\p{Cc}] Ket End ------------------------------------------------------------------ @@ -2626,7 +2716,7 @@ No match /[[:^alpha:][:^cntrl:]]+/B,utf,ucp ------------------------------------------------------------------ Bra - [ -~\x80-\xff\P{L}]++ + [\P{L}\P{Cc}]++ Ket End ------------------------------------------------------------------ @@ -2638,7 +2728,7 @@ No match /[[:^cntrl:][:^alpha:]]+/B,utf,ucp ------------------------------------------------------------------ Bra - [ -~\x80-\xff\P{L}]++ + [\P{Cc}\P{L}]++ Ket End ------------------------------------------------------------------ @@ -2850,8 +2940,6 @@ No match 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y 1: \x{2c65}\x{2c65} -# - # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE /^[\p{Batak}]/utf @@ -2886,8 +2974,6 @@ No match \x{85d} No match -# - /(\X*)(.)/s,utf A\x{300} 0: A diff --git a/testdata/testoutput8-16 b/testdata/testoutput8-16 index c51b406..62cd27b 100644 --- a/testdata/testoutput8-16 +++ b/testdata/testoutput8-16 @@ -659,18 +659,18 @@ Memory allocation (code space): 14 /[[:^alpha:][:^cntrl:]]+/utf,ucp ------------------------------------------------------------------ - 0 26 Bra - 2 [ -~\x80-\xff\P{L}]++ - 26 26 Ket - 28 End + 0 13 Bra + 2 [\P{L}\P{Cc}]++ + 13 13 Ket + 15 End ------------------------------------------------------------------ /[[:^cntrl:][:^alpha:]]+/utf,ucp ------------------------------------------------------------------ - 0 26 Bra - 2 [ -~\x80-\xff\P{L}]++ - 26 26 Ket - 28 End + 0 13 Bra + 2 [\P{Cc}\P{L}]++ + 13 13 Ket + 15 End ------------------------------------------------------------------ /[[:alpha:]]+/utf,ucp diff --git a/testdata/testoutput8-32 b/testdata/testoutput8-32 index 1cb5ff1..f27b624 100644 --- a/testdata/testoutput8-32 +++ b/testdata/testoutput8-32 @@ -659,18 +659,18 @@ Memory allocation 
(code space): 28 /[[:^alpha:][:^cntrl:]]+/utf,ucp ------------------------------------------------------------------ - 0 18 Bra - 2 [ -~\x80-\xff\P{L}]++ - 18 18 Ket - 20 End + 0 13 Bra + 2 [\P{L}\P{Cc}]++ + 13 13 Ket + 15 End ------------------------------------------------------------------ /[[:^cntrl:][:^alpha:]]+/utf,ucp ------------------------------------------------------------------ - 0 18 Bra - 2 [ -~\x80-\xff\P{L}]++ - 18 18 Ket - 20 End + 0 13 Bra + 2 [\P{Cc}\P{L}]++ + 13 13 Ket + 15 End ------------------------------------------------------------------ /[[:alpha:]]+/utf,ucp diff --git a/testdata/testoutput8-8 b/testdata/testoutput8-8 index ae0518e..92b7a28 100644 --- a/testdata/testoutput8-8 +++ b/testdata/testoutput8-8 @@ -659,18 +659,18 @@ Memory allocation (code space): 10 /[[:^alpha:][:^cntrl:]]+/utf,ucp ------------------------------------------------------------------ - 0 44 Bra - 3 [ -~\x80-\xff\P{L}]++ - 44 44 Ket - 47 End + 0 15 Bra + 3 [\P{L}\P{Cc}]++ + 15 15 Ket + 18 End ------------------------------------------------------------------ /[[:^cntrl:][:^alpha:]]+/utf,ucp ------------------------------------------------------------------ - 0 44 Bra - 3 [ -~\x80-\xff\P{L}]++ - 44 44 Ket - 47 End + 0 15 Bra + 3 [\P{Cc}\P{L}]++ + 15 15 Ket + 18 End ------------------------------------------------------------------ /[[:alpha:]]+/utf,ucp