From f40fba5dc84fbae43b33ad5499b9eea76b743124 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Wed, 6 Aug 2014 17:33:14 +0000
Subject: [PATCH] Adjust tests for PCRE/Perl anomalies in character properties
 & fix [:cntrl:] under UCP to match \p{Cc}.

---
 src/pcre2_compile.c     |  10 +++-
 src/pcre2_internal.h    |   5 ++
 testdata/testinput4     |  28 +++--------
 testdata/testinput5     |  74 +++++++++++++++++++++++++---
 testdata/testoutput4    |  58 ++++++----------------
 testdata/testoutput5    | 106 ++++++++++++++++++++++++++++++++++++----
 testdata/testoutput8-16 |  16 +++---
 testdata/testoutput8-32 |  16 +++---
 testdata/testoutput8-8  |  16 +++---
 9 files changed, 222 insertions(+), 107 deletions(-)

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 4565e2b..4157a8a 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -470,6 +470,9 @@ general substitute of a Unicode property escape (\p or \P). However, for some
 POSIX classes (e.g. graph, print, punct) a special property code is compiled
 directly. */
 
+static const PCRE2_UCHAR string_pCc[] =  {   /* "\p{Cc}", for [:cntrl:] */
+  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
+  CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 static const PCRE2_UCHAR string_pL[] =   {
   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
@@ -487,6 +490,9 @@ static const PCRE2_UCHAR string_h[] =    {
 static const PCRE2_UCHAR string_pXps[] = {
   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
+static const PCRE2_UCHAR string_PCc[] =  {   /* "\P{Cc}", for [:^cntrl:] */
+  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
+  CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 static const PCRE2_UCHAR string_PL[] =   {
   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
@@ -512,7 +518,7 @@ static PCRE2_SPTR posix_substitutes[] = {
   string_pXan,          /* alnum */
   NULL,                 /* ascii */
   string_h,             /* blank */
-  NULL,                 /* cntrl */
+  string_pCc,           /* cntrl */
   string_pNd,           /* digit */
   NULL,                 /* graph */
   NULL,                 /* print */
@@ -527,7 +533,7 @@ static PCRE2_SPTR posix_substitutes[] = {
   string_PXan,          /* ^alnum */
   NULL,                 /* ^ascii */
   string_H,             /* ^blank */
-  NULL,                 /* ^cntrl */
+  string_PCc,           /* ^cntrl */
   string_PNd,           /* ^digit */
   NULL,                 /* ^graph */
   NULL,                 /* ^print */
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index 7c2132d..f4261eb 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -389,6 +389,11 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */
 
 #ifndef EBCDIC
 
+/* Character U+180E (Mongolian Vowel Separator) is not included in the list of
+spaces in the Unicode file PropList.txt, and Perl does not recognize it as a
+space. However, many other sources list it as a space, and PCRE has treated it
+as one for a long time, so it is kept in the list below. */
+
 #define HSPACE_LIST \
   CHAR_HT, CHAR_SPACE, 0xa0, \
   0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
diff --git a/testdata/testinput4 b/testdata/testinput4
index d4b6075..873d5fa 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -1139,7 +1139,6 @@
     \x{06e9}
     \x{060b}
     ** Failers
-    \x{061c}
     X\x{06e9}   
 
 /^[\P{Yi}]/utf
@@ -1492,7 +1491,7 @@
     >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} 
 
 /^>[[:blank:]]*/utf,ucp
-    >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 
+    >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 
 
 /^[[:alpha:]]*/utf,ucp
     Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
@@ -2045,11 +2044,11 @@
 
 /^A\s+Z/utf,ucp
     A\x{2005}Z
-    A\x{85}\x{180e}\x{2005}Z
+    A\x{85}\x{2005}Z
 
 /^A[\s]+Z/utf,ucp
     A\x{2005}Z
-    A\x{85}\x{180e}\x{2005}Z
+    A\x{85}\x{2005}Z
 
 /^[[:graph:]]+$/utf,ucp
     Letter:ABC
@@ -2075,17 +2074,11 @@
     \x{20}
     \x{85}
     \x{a0}
-    \x{61c}
     \x{1680}
-    \x{180e}
     \x{2028}
     \x{2029}
     \x{202f}
     \x{2065}
-    \x{2066}
-    \x{2067}
-    \x{2068}
-    \x{2069}
     \x{3000}
     \x{e0002}
     \x{e001f}
@@ -2103,7 +2096,6 @@
     Punctuation:\x{66a},;
     Symbol:\x{6de}<>\x{fffc}
     Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
-    \x{180e}
     \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
     \x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
     \x{202f}
@@ -2119,14 +2111,9 @@
     \x{09}
     \x{1D}
     \x{85}
-    \x{61c}
     \x{2028}
     \x{2029}
     \x{2065}
-    \x{2066}
-    \x{2067}
-    \x{2068}
-    \x{2069}
     \x{e0002}
     \x{e001f}
     \x{e0080} 
@@ -2140,8 +2127,8 @@
     abcde  
 
 /^[[:^graph:]]+$/utf,ucp
-    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
-    \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
+    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680}
+    \x{2028}\x{2029}\x{202f}\x{2065}
     \x{3000}\x{e0002}\x{e001f}\x{e0080}
     ** Failers
     Letter:ABC
@@ -2162,8 +2149,8 @@
     \x{e0020}\x{e0030}\x{e0040}\x{e0050}\x{e0060}\x{e0070}\x{e007f}
 
 /^[[:^print:]]+$/utf,ucp
-    \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
-    \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
+    \x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065}
+    \x{e0002}\x{e001f}\x{e0080}
     ** Failers
     Space: \x{a0}
     \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005}
@@ -2176,7 +2163,6 @@
     Punctuation:\x{66a},;
     Symbol:\x{6de}<>\x{fffc}
     Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
-    \x{180e}
     \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
     \x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
     \x{202f}
diff --git a/testdata/testinput5 b/testdata/testinput5
index a2431b1..9149855 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2,7 +2,72 @@
 # support, including Unicode properties. However, tests that give different
 # results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
 # 12).
+
+# PCRE2 and Perl disagree about the characteristics of certain Unicode
+# characters. For example, 061C is considered by Perl to be Arabic, though
+# is it not listed as such in the Unicode Scripts.txt file, and 2066-2069 are
+# graphic and printable according to Perl, though they are actually "isolate"
+# control characters. That is why the following tests are here rather than in
+# test 4.
+
+/^[\p{Arabic}]/utf
+    ** Failers
+    \x{061c}
     
+/^[[:graph:]]+$/utf,ucp
+    ** Failers
+    \x{61c}
+    \x{2066}
+    \x{2067}
+    \x{2068}
+    \x{2069}
+
+/^[[:print:]]+$/utf,ucp
+    ** Failers
+    \x{61c}
+    \x{2066}
+    \x{2067}
+    \x{2068}
+    \x{2069}
+
+/^[[:^graph:]]+$/utf,ucp
+    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
+    \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
+
+/^[[:^print:]]+$/utf,ucp
+    \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
+    \x{2068}\x{2069}
+     
+# Perl does not consider U+180e to be a space character. It is true that it
+# does not appear in the Unicode PropList.txt file as such, but in many other
+# sources it is listed as a space, and has been treated as such in PCRE for
+# a long time. 
+
+/^>[[:blank:]]*/utf,ucp
+    >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 
+
+/^A\s+Z/utf,ucp
+    A\x{85}\x{180e}\x{2005}Z
+
+/^A[\s]+Z/utf,ucp
+    A\x{2005}Z
+    A\x{85}\x{2005}Z
+    
+/^[[:graph:]]+$/utf,ucp
+    \x{180e}
+
+/^[[:print:]]+$/utf,ucp
+    \x{180e}
+
+/^[[:^graph:]]+$/utf,ucp
+    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
+
+/^[[:^print:]]+$/utf,ucp
+    \x{180e}
+
+# End of U+180E tests.
+
+# ---------------------------------------------------------------------
 
 /\x{110000}/IB,utf
 
@@ -872,9 +937,8 @@
     \x{2028}
     \x{200d} 
   
-# These are here rather than in test 6 because Perl has problems with
-# the negative versions of the properties and behaves has changed how
-# it behaves for caseless matching. 
+# These are here because Perl has problems with the negative versions of the
+# properties and has changed how it behaves for caseless matching.
       
 /\p{^Lu}/i,utf
     1234
@@ -1264,8 +1328,6 @@
 /(\x{2c65}\x{2c65})\1Y/i,utf
     X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
 
-#  
-
 # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE 
 
 /^[\p{Batak}]/utf
@@ -1287,8 +1349,6 @@
     \x{85c}
     \x{85d}    
 
-#  
-
 /(\X*)(.)/s,utf
     A\x{300}
 
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 97cff37..b3b6896 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1983,8 +1983,6 @@ No match
     \x{060b}
  0: \x{60b}
     ** Failers
-No match
-    \x{061c}
 No match
     X\x{06e9}   
 No match
@@ -2578,8 +2576,8 @@ No match
  0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b}
 
 /^>[[:blank:]]*/utf,ucp
-    >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 
- 0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09}
+    >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 
+ 0: > \x{a0}\x{1680}\x{2000}\x{202f}\x{09}
 
 /^[[:alpha:]]*/utf,ucp
     Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}
@@ -2591,7 +2589,7 @@ No match
 
 /^[[:cntrl:]]*/utf,ucp
     \x{0}\x{09}\x{1f}\x{7f}\x{9f} 
- 0: \x{00}\x{09}\x{1f}\x{7f}
+ 0: \x{00}\x{09}\x{1f}\x{7f}\x{9f}
 
 /^[[:graph:]]*/utf,ucp
     A\x{a1}\x{a0}
@@ -3414,14 +3412,14 @@ No match
 /^A\s+Z/utf,ucp
     A\x{2005}Z
  0: A\x{2005}Z
-    A\x{85}\x{180e}\x{2005}Z
- 0: A\x{85}\x{180e}\x{2005}Z
+    A\x{85}\x{2005}Z
+ 0: A\x{85}\x{2005}Z
 
 /^A[\s]+Z/utf,ucp
     A\x{2005}Z
  0: A\x{2005}Z
-    A\x{85}\x{180e}\x{2005}Z
- 0: A\x{85}\x{180e}\x{2005}Z
+    A\x{85}\x{2005}Z
+ 0: A\x{85}\x{2005}Z
 
 /^[[:graph:]]+$/utf,ucp
     Letter:ABC
@@ -3469,12 +3467,8 @@ No match
     \x{85}
 No match
     \x{a0}
-No match
-    \x{61c}
 No match
     \x{1680}
-No match
-    \x{180e}
 No match
     \x{2028}
 No match
@@ -3483,14 +3477,6 @@ No match
     \x{202f}
 No match
     \x{2065}
-No match
-    \x{2066}
-No match
-    \x{2067}
-No match
-    \x{2068}
-No match
-    \x{2069}
 No match
     \x{3000}
 No match
@@ -3524,8 +3510,6 @@ No match
  0: Symbol:\x{6de}<>\x{fffc}
     Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
  0: Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
-    \x{180e}
- 0: \x{180e}
     \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
  0: \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
     \x{202a}\x{202b}\x{202c}\x{202d}\x{202e}
@@ -3555,22 +3539,12 @@ No match
     \x{1D}
 No match
     \x{85}
-No match
-    \x{61c}
 No match
     \x{2028}
 No match
     \x{2029}
 No match
     \x{2065}
-No match
-    \x{2066}
-No match
-    \x{2067}
-No match
-    \x{2068}
-No match
-    \x{2069}
 No match
     \x{e0002}
 No match
@@ -3594,10 +3568,10 @@ No match
 No match
 
 /^[[:^graph:]]+$/utf,ucp
-    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
- 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e}
-    \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
- 0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
+    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680}
+ 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{1680}
+    \x{2028}\x{2029}\x{202f}\x{2065}
+ 0: \x{2028}\x{2029}\x{202f}\x{2065}
     \x{3000}\x{e0002}\x{e001f}\x{e0080}
  0: \x{3000}\x{e0002}\x{e001f}\x{e0080}
     ** Failers
@@ -3636,10 +3610,10 @@ No match
 No match
 
 /^[[:^print:]]+$/utf,ucp
-    \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
- 0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
-    \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
- 0: \x{2068}\x{2069}\x{e0002}\x{e001f}\x{e0080}
+    \x{09}\x{1D}\x{85}\x{2028}\x{2029}\x{2065}
+ 0: \x{09}\x{1d}\x{85}\x{2028}\x{2029}\x{2065}
+    \x{e0002}\x{e001f}\x{e0080}
+ 0: \x{e0002}\x{e001f}\x{e0080}
     ** Failers
 No match
     Space: \x{a0}
@@ -3663,8 +3637,6 @@ No match
     Symbol:\x{6de}<>\x{fffc}
 No match
     Cf-property:\x{ad}\x{600}\x{601}\x{602}\x{603}\x{604}\x{6dd}\x{70f}
-No match
-    \x{180e}
 No match
     \x{200b}\x{200c}\x{200d}\x{200e}\x{200f}
 No match
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index d1bb20a..2ddd11f 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2,7 +2,98 @@
 # support, including Unicode properties. However, tests that give different
 # results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
 # 12).
+
+# PCRE2 and Perl disagree about the characteristics of certain Unicode
+# characters. For example, 061C is considered by Perl to be Arabic, though
+# is it not listed as such in the Unicode Scripts.txt file, and 2066-2069 are
+# graphic and printable according to Perl, though they are actually "isolate"
+# control characters. That is why the following tests are here rather than in
+# test 4.
+
+/^[\p{Arabic}]/utf
+    ** Failers
+No match
+    \x{061c}
+No match
     
+/^[[:graph:]]+$/utf,ucp
+    ** Failers
+No match
+    \x{61c}
+No match
+    \x{2066}
+No match
+    \x{2067}
+No match
+    \x{2068}
+No match
+    \x{2069}
+No match
+
+/^[[:print:]]+$/utf,ucp
+    ** Failers
+ 0: ** Failers
+    \x{61c}
+No match
+    \x{2066}
+No match
+    \x{2067}
+No match
+    \x{2068}
+No match
+    \x{2069}
+No match
+
+/^[[:^graph:]]+$/utf,ucp
+    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
+ 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}
+    \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
+ 0: \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
+
+/^[[:^print:]]+$/utf,ucp
+    \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
+ 0: \x{09}\x{1d}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
+    \x{2068}\x{2069}
+ 0: \x{2068}\x{2069}
+     
+# Perl does not consider U+180e to be a space character. It is true that it
+# does not appear in the Unicode PropList.txt file as such, but in many other
+# sources it is listed as a space, and has been treated as such in PCRE for
+# a long time. 
+
+/^>[[:blank:]]*/utf,ucp
+    >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 
+ 0: > \x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{09}
+
+/^A\s+Z/utf,ucp
+    A\x{85}\x{180e}\x{2005}Z
+ 0: A\x{85}\x{180e}\x{2005}Z
+
+/^A[\s]+Z/utf,ucp
+    A\x{2005}Z
+ 0: A\x{2005}Z
+    A\x{85}\x{2005}Z
+ 0: A\x{85}\x{2005}Z
+    
+/^[[:graph:]]+$/utf,ucp
+    \x{180e}
+No match
+
+/^[[:print:]]+$/utf,ucp
+    \x{180e}
+ 0: \x{180e}
+
+/^[[:^graph:]]+$/utf,ucp
+    \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
+ 0: \x{09}\x{0a}\x{1d} \x{85}\x{a0}\x{61c}\x{1680}\x{180e}
+
+/^[[:^print:]]+$/utf,ucp
+    \x{180e}
+No match
+
+# End of U+180E tests.
+
+# ---------------------------------------------------------------------
 
 /\x{110000}/IB,utf
 Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large
@@ -2015,9 +2106,8 @@ No match
     \x{200d} 
 No match
   
-# These are here rather than in test 6 because Perl has problems with
-# the negative versions of the properties and behaves has changed how
-# it behaves for caseless matching. 
+# These are here because Perl has problems with the negative versions of the
+# properties and has changed how it behaves for caseless matching.
       
 /\p{^Lu}/i,utf
     1234
@@ -2520,7 +2610,7 @@ No match
 /[[:cntrl:]]/B,ucp
 ------------------------------------------------------------------
         Bra
-        [\x00-\x1f\x7f]
+        [\p{Cc}]
         Ket
         End
 ------------------------------------------------------------------
@@ -2626,7 +2716,7 @@ No match
 /[[:^alpha:][:^cntrl:]]+/B,utf,ucp
 ------------------------------------------------------------------
         Bra
-        [ -~\x80-\xff\P{L}]++
+        [\P{L}\P{Cc}]++
         Ket
         End
 ------------------------------------------------------------------
@@ -2638,7 +2728,7 @@ No match
 /[[:^cntrl:][:^alpha:]]+/B,utf,ucp
 ------------------------------------------------------------------
         Bra
-        [ -~\x80-\xff\P{L}]++
+        [\P{Cc}\P{L}]++
         Ket
         End
 ------------------------------------------------------------------
@@ -2850,8 +2940,6 @@ No match
  0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
  1: \x{2c65}\x{2c65}
 
-#  
-
 # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE 
 
 /^[\p{Batak}]/utf
@@ -2886,8 +2974,6 @@ No match
     \x{85d}    
 No match
 
-#  
-
 /(\X*)(.)/s,utf
     A\x{300}
  0: A
diff --git a/testdata/testoutput8-16 b/testdata/testoutput8-16
index c51b406..62cd27b 100644
--- a/testdata/testoutput8-16
+++ b/testdata/testoutput8-16
@@ -659,18 +659,18 @@ Memory allocation (code space): 14
 
 /[[:^alpha:][:^cntrl:]]+/utf,ucp
 ------------------------------------------------------------------
-  0  26 Bra
-  2     [ -~\x80-\xff\P{L}]++
- 26  26 Ket
- 28     End
+  0  13 Bra
+  2     [\P{L}\P{Cc}]++
+ 13  13 Ket
+ 15     End
 ------------------------------------------------------------------
 
 /[[:^cntrl:][:^alpha:]]+/utf,ucp
 ------------------------------------------------------------------
-  0  26 Bra
-  2     [ -~\x80-\xff\P{L}]++
- 26  26 Ket
- 28     End
+  0  13 Bra
+  2     [\P{Cc}\P{L}]++
+ 13  13 Ket
+ 15     End
 ------------------------------------------------------------------
 
 /[[:alpha:]]+/utf,ucp
diff --git a/testdata/testoutput8-32 b/testdata/testoutput8-32
index 1cb5ff1..f27b624 100644
--- a/testdata/testoutput8-32
+++ b/testdata/testoutput8-32
@@ -659,18 +659,18 @@ Memory allocation (code space): 28
 
 /[[:^alpha:][:^cntrl:]]+/utf,ucp
 ------------------------------------------------------------------
-  0  18 Bra
-  2     [ -~\x80-\xff\P{L}]++
- 18  18 Ket
- 20     End
+  0  13 Bra
+  2     [\P{L}\P{Cc}]++
+ 13  13 Ket
+ 15     End
 ------------------------------------------------------------------
 
 /[[:^cntrl:][:^alpha:]]+/utf,ucp
 ------------------------------------------------------------------
-  0  18 Bra
-  2     [ -~\x80-\xff\P{L}]++
- 18  18 Ket
- 20     End
+  0  13 Bra
+  2     [\P{Cc}\P{L}]++
+ 13  13 Ket
+ 15     End
 ------------------------------------------------------------------
 
 /[[:alpha:]]+/utf,ucp
diff --git a/testdata/testoutput8-8 b/testdata/testoutput8-8
index ae0518e..92b7a28 100644
--- a/testdata/testoutput8-8
+++ b/testdata/testoutput8-8
@@ -659,18 +659,18 @@ Memory allocation (code space): 10
 
 /[[:^alpha:][:^cntrl:]]+/utf,ucp
 ------------------------------------------------------------------
-  0  44 Bra
-  3     [ -~\x80-\xff\P{L}]++
- 44  44 Ket
- 47     End
+  0  15 Bra
+  3     [\P{L}\P{Cc}]++
+ 15  15 Ket
+ 18     End
 ------------------------------------------------------------------
 
 /[[:^cntrl:][:^alpha:]]+/utf,ucp
 ------------------------------------------------------------------
-  0  44 Bra
-  3     [ -~\x80-\xff\P{L}]++
- 44  44 Ket
- 47     End
+  0  15 Bra
+  3     [\P{Cc}\P{L}]++
+ 15  15 Ket
+ 18     End
 ------------------------------------------------------------------
 
 /[[:alpha:]]+/utf,ucp