# This set of tests is for UTF-16 and UTF-32 support, including Unicode # properties. It is relevant only to the 16-bit and 32-bit libraries. The # output is different for each library, so there are separate output files. /ÃÃÃxxx/IB,utf,no_utf_check ** Failed: invalid UTF-8 string cannot be converted to 32-bit string /abc/utf Ã] ** Failed: invalid UTF-8 string cannot be used as input in UTF mode # Check maximum character size /\x{ffff}/IB,utf ------------------------------------------------------------------ Bra \x{ffff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{ffff} Subject length lower bound = 1 /\x{10000}/IB,utf ------------------------------------------------------------------ Bra \x{10000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{10000} Subject length lower bound = 1 /\x{100}/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Subject length lower bound = 1 /\x{1000}/IB,utf ------------------------------------------------------------------ Bra \x{1000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{1000} Subject length lower bound = 1 /\x{10000}/IB,utf ------------------------------------------------------------------ Bra \x{10000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{10000} Subject length lower bound = 1 /\x{100000}/IB,utf ------------------------------------------------------------------ Bra \x{100000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100000} Subject length lower bound = 1 /\x{10ffff}/IB,utf ------------------------------------------------------------------ Bra \x{10ffff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{10ffff} Subject length lower bound = 1 /[\x{ff}]/IB,utf ------------------------------------------------------------------ Bra \x{ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xff Subject length lower bound = 1 /[\x{100}]/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Subject length lower bound = 1 /\x80/IB,utf ------------------------------------------------------------------ Bra \x{80} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x80 Subject length lower bound = 1 /\xff/IB,utf ------------------------------------------------------------------ Bra \x{ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xff Subject length lower bound = 1 /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf ------------------------------------------------------------------ Bra \x{d55c}\x{ad6d}\x{c5b4} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{d55c} Last code unit = \x{c5b4} Subject length lower bound = 3 \x{D55c}\x{ad6d}\x{C5B4} 0: \x{d55c}\x{ad6d}\x{c5b4} /\x{65e5}\x{672c}\x{8a9e}/IB,utf ------------------------------------------------------------------ Bra \x{65e5}\x{672c}\x{8a9e} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{65e5} Last code unit = \x{8a9e} Subject length lower bound = 3 \x{65e5}\x{672c}\x{8a9e} 0: \x{65e5}\x{672c}\x{8a9e} /\x{80}/IB,utf ------------------------------------------------------------------ Bra \x{80} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x80 Subject length lower bound = 1 /\x{084}/IB,utf ------------------------------------------------------------------ Bra \x{84} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x84 Subject length lower bound = 1 /\x{104}/IB,utf ------------------------------------------------------------------ Bra \x{104} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{104} Subject length lower bound = 1 /\x{861}/IB,utf ------------------------------------------------------------------ Bra \x{861} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{861} Subject length lower bound = 1 /\x{212ab}/IB,utf ------------------------------------------------------------------ Bra \x{212ab} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{212ab} Subject length lower bound = 1 /[^ab\xC0-\xF0]/IB,utf ------------------------------------------------------------------ Bra [\x00-`c-\xbf\xf1-\xff] (neg) Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 \x{f1} 0: \x{f1} \x{bf} 0: \x{bf} \x{100} 0: \x{100} \x{1000} 0: \x{1000} \= Expect no match \x{c0} No match \x{f0} No match /Ä€{3,4}/IB,utf ------------------------------------------------------------------ Bra \x{100}{3} \x{100}?+ Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Last code unit = \x{100} Subject length lower bound = 3 \x{100}\x{100}\x{100}\x{100\x{100} 0: \x{100}\x{100}\x{100} /(\x{100}+|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100}++ Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: x \xff Subject length lower bound = 1 /(\x{100}*a|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100}*+ a Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: a x \xff Subject length lower bound = 1 /(\x{100}{0,2}a|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100}{0,2}+ a Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: a x \xff Subject length lower bound = 1 /(\x{100}{1,2}a|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100} \x{100}{0,1}+ a Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: x \xff Subject length lower bound = 1 /\x{100}/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Subject length lower bound = 1 /a\x{100}\x{101}*/IB,utf ------------------------------------------------------------------ Bra a\x{100} \x{101}*+ Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'a' Last code unit = \x{100} Subject length lower bound = 2 /a\x{100}\x{101}+/IB,utf ------------------------------------------------------------------ Bra a\x{100} \x{101}++ Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'a' Last code unit = \x{101} Subject length lower bound = 3 /[^\x{c4}]/IB ------------------------------------------------------------------ Bra [^\x{c4}] Ket End ------------------------------------------------------------------ Capture group count = 0 Subject length lower bound = 1 /[\x{100}]/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Subject length lower bound = 1 \x{100} 0: \x{100} Z\x{100} 0: \x{100} \x{100}Z 0: \x{100} /[\xff]/IB,utf ------------------------------------------------------------------ Bra \x{ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xff Subject length lower bound = 1 >\x{ff}< 0: \x{ff} /[^\xff]/IB,utf ------------------------------------------------------------------ Bra [^\x{ff}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Subject length lower bound = 1 /\x{100}abc(xyz(?1))/IB,utf ------------------------------------------------------------------ Bra \x{100}abc CBra 1 xyz Recurse Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf First code unit = \x{100} Last code unit = 'z' Subject length lower bound = 7 /\777/I,utf Capture group count = 0 Options: utf First code unit = \x{1ff} Subject length lower bound = 1 \x{1ff} 0: \x{1ff} \777 0: \x{1ff} /\x{100}+\x{200}/IB,utf ------------------------------------------------------------------ Bra \x{100}++ \x{200} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Last code unit = \x{200} Subject length lower bound = 2 /\x{100}+X/IB,utf ------------------------------------------------------------------ Bra \x{100}++ X Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \x{100} Last code unit = 'X' Subject length lower bound = 2 /^[\QÄ€\E-\QÅ\E/B,utf Failed: error 106 at offset 13: missing terminating ] for character class /X/utf XX\x{d800}\=no_utf_check 0: X XX\x{da00}\=no_utf_check 0: X XX\x{dc00}\=no_utf_check 0: X XX\x{de00}\=no_utf_check 0: X XX\x{dfff}\=no_utf_check 0: X \= Expect UTF error XX\x{d800} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 XX\x{da00} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 XX\x{dc00} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 XX\x{de00} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 XX\x{dfff} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 XX\x{110000} Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2 XX\x{d800}\x{1234} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 \= Expect no match XX\x{d800}\=offset=3 No match /(?<=.)X/utf XX\x{d800}\=offset=3 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 /(*UTF16)\x{11234}/ Failed: error 160 at offset 7: (*VERB) not recognized or malformed abcd\x{11234}pqr /(*UTF)\x{11234}/I Capture group count = 0 Compile options: Overall options: utf First code unit = \x{11234} Subject length lower bound = 1 abcd\x{11234}pqr 0: \x{11234} /(*UTF-32)\x{11234}/ Failed: error 160 at offset 5: (*VERB) not recognized or malformed abcd\x{11234}pqr /(*UTF-32)\x{112}/ Failed: error 160 at offset 5: (*VERB) not recognized or malformed abcd\x{11234}pqr /(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I Failed: error 160 at offset 14: (*VERB) not recognized or malformed /(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I Capture group count = 0 Compile options: Overall options: utf \R matches any Unicode newline Forced newline is CRLF First code unit = 'a' Last code unit = 'b' Subject length lower bound = 3 /\h/I,utf Capture group count = 0 Options: utf Starting code units: \x09 \x20 \xa0 \xff Subject length lower bound = 1 ABC\x{09} 0: \x{09} ABC\x{20} 0: ABC\x{a0} 0: \x{a0} ABC\x{1680} 0: \x{1680} ABC\x{180e} 0: \x{180e} ABC\x{2000} 0: \x{2000} ABC\x{202f} 0: \x{202f} ABC\x{205f} 0: \x{205f} ABC\x{3000} 0: \x{3000} /\v/I,utf Capture group count = 0 Options: utf Starting code units: \x0a \x0b \x0c \x0d \x85 \xff Subject length lower bound = 1 ABC\x{0a} 0: \x{0a} ABC\x{0b} 0: \x{0b} ABC\x{0c} 0: \x{0c} ABC\x{0d} 0: \x{0d} ABC\x{85} 0: \x{85} ABC\x{2028} 0: \x{2028} /\h*A/I,utf Capture group count = 0 Options: utf Starting code units: \x09 \x20 A \xa0 \xff Last code unit = 'A' Subject length lower bound = 1 CDBABC 0: A \x{2000}ABC 0: \x{2000}A /\R*A/I,bsr=unicode,utf Capture group count = 0 Options: utf \R matches any Unicode newline Starting code units: \x0a \x0b \x0c \x0d A \x85 \xff Last code unit = 'A' Subject length lower bound = 1 CDBABC 0: A \x{2028}A 0: \x{2028}A /\v+A/I,utf Capture group count = 0 Options: utf Starting code units: \x0a \x0b \x0c \x0d \x85 \xff Last code unit = 'A' Subject length lower bound = 2 /\s?xxx\s/I,utf Capture group count = 0 Options: utf Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x Last code unit = 'x' Subject length lower bound = 4 /\sxxx\s/I,utf,tables=2 Capture group count = 0 Options: utf Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \x85 \xa0 Last code unit = 'x' Subject length lower bound = 5 AB\x{85}xxx\x{a0}XYZ 0: \x{85}xxx\x{a0} AB\x{a0}xxx\x{85}XYZ 0: \x{a0}xxx\x{85} /\S \S/I,utf,tables=2 Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Last code unit = ' ' Subject length lower bound = 3 \x{a2} \x{84} 0: \x{a2} \x{84} A Z 0: A Z /a+/utf a\x{123}aa\=offset=1 0: aa a\x{123}aa\=offset=2 0: aa a\x{123}aa\=offset=3 0: a \= Expect no match a\x{123}aa\=offset=4 No match \= Expect bad offset error a\x{123}aa\=offset=5 Failed: error -33: bad offset value a\x{123}aa\=offset=6 Failed: error -33: bad offset value /\x{1234}+/Ii,utf Capture group count = 0 Options: caseless utf First code unit = \x{1234} Subject length lower bound = 1 /\x{1234}+?/Ii,utf Capture group count = 0 Options: caseless utf First code unit = \x{1234} Subject length lower bound = 1 /\x{1234}++/Ii,utf Capture group count = 0 Options: caseless utf First code unit = \x{1234} Subject length lower bound = 1 /\x{1234}{2}/Ii,utf Capture group count = 0 Options: caseless utf First code unit = \x{1234} Last code unit = \x{1234} Subject length lower bound = 2 /[^\x{c4}]/IB,utf ------------------------------------------------------------------ Bra [^\x{c4}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Subject length lower bound = 1 /X+\x{200}/IB,utf ------------------------------------------------------------------ Bra X++ \x{200} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'X' Last code unit = \x{200} Subject length lower bound = 2 /\R/I,utf Capture group count = 0 Options: utf Starting code units: \x0a \x0b \x0c \x0d \x85 \xff Subject length lower bound = 1 # Check bad offset /a/utf \= Expect bad UTF-16 offset, or no match in 32-bit \x{10000}\=offset=1 No match \x{10000}ab\=offset=1 0: a \= Expect 16-bit match, 32-bit no match \x{10000}ab\=offset=2 No match \= Expect no match \x{10000}ab\=offset=3 No match \= Expect no match in 16-bit, bad offset in 32-bit \x{10000}ab\=offset=4 Failed: error -33: bad offset value \= Expect bad offset \x{10000}ab\=offset=5 Failed: error -33: bad offset value /í¼€/utf Failed: error -27 at offset 0: UTF-32 error: code points 0xd800-0xdfff are not defined /\w+\x{C4}/B,utf ------------------------------------------------------------------ Bra \w++ \x{c4} Ket End ------------------------------------------------------------------ a\x{C4}\x{C4} 0: a\x{c4} /\w+\x{C4}/B,utf,tables=2 ------------------------------------------------------------------ Bra \w+ \x{c4} Ket End ------------------------------------------------------------------ a\x{C4}\x{C4} 0: a\x{c4}\x{c4} /\W+\x{C4}/B,utf ------------------------------------------------------------------ Bra \W+ \x{c4} Ket End ------------------------------------------------------------------ !\x{C4} 0: !\x{c4} /\W+\x{C4}/B,utf,tables=2 ------------------------------------------------------------------ Bra \W++ \x{c4} Ket End ------------------------------------------------------------------ !\x{C4} 0: !\x{c4} /\W+\x{A1}/B,utf ------------------------------------------------------------------ Bra \W+ \x{a1} Ket End ------------------------------------------------------------------ !\x{A1} 0: !\x{a1} /\W+\x{A1}/B,utf,tables=2 ------------------------------------------------------------------ Bra \W+ \x{a1} Ket End ------------------------------------------------------------------ !\x{A1} 0: !\x{a1} /X\s+\x{A0}/B,utf ------------------------------------------------------------------ Bra X \s++ \x{a0} Ket End ------------------------------------------------------------------ X\x20\x{A0}\x{A0} 0: X \x{a0} /X\s+\x{A0}/B,utf,tables=2 ------------------------------------------------------------------ Bra X \s+ \x{a0} Ket End ------------------------------------------------------------------ X\x20\x{A0}\x{A0} 0: X \x{a0}\x{a0} /\S+\x{A0}/B,utf ------------------------------------------------------------------ Bra \S+ \x{a0} Ket End ------------------------------------------------------------------ X\x{A0}\x{A0} 0: X\x{a0}\x{a0} /\S+\x{A0}/B,utf,tables=2 ------------------------------------------------------------------ Bra \S++ \x{a0} Ket End ------------------------------------------------------------------ X\x{A0}\x{A0} 0: X\x{a0} /\x{a0}+\s!/B,utf ------------------------------------------------------------------ Bra \x{a0}++ \s ! Ket End ------------------------------------------------------------------ \x{a0}\x20! 0: \x{a0} ! /\x{a0}+\s!/B,utf,tables=2 ------------------------------------------------------------------ Bra \x{a0}+ \s ! Ket End ------------------------------------------------------------------ \x{a0}\x20! 0: \x{a0} ! /(*UTF)abc/never_utf Failed: error 174 at offset 6: using UTF is disabled by the application /abc/utf,never_utf Failed: error 174 at offset 0: using UTF is disabled by the application /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf ------------------------------------------------------------------ Bra /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf First code unit = 'A' (caseless) Last code unit = \x{1fb0} (caseless) Subject length lower bound = 5 /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf ------------------------------------------------------------------ Bra A\x{391}\x{10427}\x{ff3a}\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'A' Last code unit = \x{1fb0} Subject length lower bound = 5 /AB\x{1fb0}/IB,utf ------------------------------------------------------------------ Bra AB\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'A' Last code unit = \x{1fb0} Subject length lower bound = 3 /AB\x{1fb0}/IBi,utf ------------------------------------------------------------------ Bra /i AB\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf First code unit = 'A' (caseless) Last code unit = \x{1fb0} (caseless) Subject length lower bound = 3 /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf Capture group count = 0 Options: caseless utf First code unit = \x{401} (caseless) Last code unit = \x{42f} (caseless) Subject length lower bound = 17 \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} /[â±¥]/Bi,utf ------------------------------------------------------------------ Bra /i \x{2c65} Ket End ------------------------------------------------------------------ /[^â±¥]/Bi,utf ------------------------------------------------------------------ Bra /i [^\x{2c65}] Ket End ------------------------------------------------------------------ /[[:blank:]]/B,ucp ------------------------------------------------------------------ Bra [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] Ket End ------------------------------------------------------------------ /\x{212a}+/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: K k \xff Subject length lower bound = 1 KKkk\x{212a} 0: KKkk\x{212a} /s+/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: S s \xff Subject length lower bound = 1 SSss\x{17f} 0: SSss\x{17f} # Non-UTF characters should give errors in both 16-bit and 32-bit modes. /\x{110000}/utf Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large /\o{4200000}/utf Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large /\x{100}*A/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ A Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: A \xff Last code unit = 'A' Subject length lower bound = 1 A 0: A /\x{100}*\d(?R)/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \d Recurse Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: 0 1 2 3 4 5 6 7 8 9 \xff Subject length lower bound = 1 /[Z\x{100}]/IB,utf ------------------------------------------------------------------ Bra [Z\x{100}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: Z \xff Subject length lower bound = 1 Z\x{100} 0: Z \x{100} 0: \x{100} \x{100}Z 0: \x{100} /[z-\x{100}]/IB,utf ------------------------------------------------------------------ Bra [z-\xff\x{100}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /[z\Qa-d]Ä€\E]/IB,utf ------------------------------------------------------------------ Bra [\-\]adz\x{100}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: - ] a d z \xff Subject length lower bound = 1 \x{100} 0: \x{100} Ä€ 0: \x{100} /[ab\x{100}]abc(xyz(?1))/IB,utf ------------------------------------------------------------------ Bra [ab\x{100}] abc CBra 1 xyz Recurse Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: a b \xff Last code unit = 'z' Subject length lower bound = 7 /\x{100}*\s/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \s Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xff Subject length lower bound = 1 /\x{100}*\d/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \d Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: 0 1 2 3 4 5 6 7 8 9 \xff Subject length lower bound = 1 /\x{100}*\w/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \w Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z \xff Subject length lower bound = 1 /\x{100}*\D/IB,utf ------------------------------------------------------------------ Bra \x{100}* \D Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /\x{100}*\S/IB,utf ------------------------------------------------------------------ Bra \x{100}* \S Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /\x{100}*\W/IB,utf ------------------------------------------------------------------ Bra \x{100}* \W Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /[\x{105}-\x{109}]/IBi,utf ------------------------------------------------------------------ Bra [\x{104}-\x{109}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: \xff Subject length lower bound = 1 \x{104} 0: \x{104} \x{105} 0: \x{105} \x{109} 0: \x{109} \= Expect no match \x{100} No match \x{10a} No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: Z z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 Z 0: Z z 0: z \x{39c} 0: \x{39c} \x{178} 0: \x{178} | 0: | \x{80} 0: \x{80} \x{ff} 0: \x{ff} \x{100} 0: \x{100} \x{101} 0: \x{101} \= Expect no match \x{102} No match Y No match y No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: Z z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /\x{3a3}B/IBi,utf ------------------------------------------------------------------ Bra clist 03a3 03c2 03c3 /i B Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: \xff Last code unit = 'B' (caseless) Subject length lower bound = 2 /./utf \x{110000} Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0 /(*UTF)abý¿¿¿¿¿z/B ------------------------------------------------------------------ Bra ab\x{fd}\x{bf}\x{bf}\x{bf}\x{bf}\x{bf}z Ket End ------------------------------------------------------------------ /abý¿¿¿¿¿z/utf ** Failed: character value greater than 0x10ffff cannot be converted to UTF /[\W\p{Any}]/B ------------------------------------------------------------------ Bra [\x00-/:-@[-^`{-\xff\p{Any}\x{100}-\x{ffffffff}] Ket End ------------------------------------------------------------------ abc 0: a 123 0: 1 /[\W\pL]/B ------------------------------------------------------------------ Bra [\x00-/:-@[-^`{-\xff\p{L}\x{100}-\x{ffffffff}] Ket End ------------------------------------------------------------------ abc 0: a \x{100} 0: \x{100} \x{308} 0: \x{308} \= Expect no match 123 No match /[\s[:^ascii:]]/B,ucp ------------------------------------------------------------------ Bra [\x80-\xff\p{Xsp}\x{100}-\x{ffffffff}] Ket End ------------------------------------------------------------------ /\pP/ucp \x{7fffffff} No match # A special extra option allows excaped surrogate code points in 32-bit mode, # but subjects containing them must not be UTF-checked. These patterns give # errors in 16-bit mode. /\x{d800}/I,utf,allow_surrogate_escapes Capture group count = 0 Options: utf Extra options: allow_surrogate_escapes First code unit = \x{d800} Subject length lower bound = 1 \x{d800}\=no_utf_check 0: \x{d800} /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes \x{dfff}\x{df01}\=no_utf_check 0: \x{dfff}\x{df01} # This has different starting code units in 8-bit mode. /^[^ab]/IB,utf ------------------------------------------------------------------ Bra ^ [\x00-`c-\xff] (neg) Ket End ------------------------------------------------------------------ Capture group count = 0 Compile options: utf Overall options: anchored utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 c 0: c \x{ff} 0: \x{ff} \x{100} 0: \x{100} \= Expect no match aaa No match # Offsets are different in 8-bit mode. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout 123abcáyzabcdef789abcሴqr 1(2) Old 6 6 "" New 6 8 "<>" 2(2) Old 12 12 "" New 14 16 "<>" 3(2) Old 12 15 "def" New 16 21 "" 4(2) Old 21 21 "" New 27 29 "<>" 4: 123abc<>\x{e1}yzabc<>789abc<>\x{1234}qr # A few script run tests in non-UTF mode (but they need Unicode support) /^(*script_run:.{4})/ \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han 0: \x{3041}\x{30a1}\x{3007}\x{3007} \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han 0: \x{30a1}\x{3041}\x{3007}\x{3007} \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul 0: \x{1100}\x{2e80}\x{2e80}\x{1101} /^(*sr:.*)/utf,allow_surrogate_escapes \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana 0: \x{2e80}\x{3105}\x{2e80} \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check 0: \x{d800} /(?(n/utf Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?) /(?(á/utf Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?) # Invalid UTF-16/32 tests. /.../g,match_invalid_utf abcd\x{df00}wxzy\x{df00}pqrs 0: abc 0: wxz 0: pqr abcd\x{80}wxzy\x{df00}pqrs 0: abc 0: d\x{80}w 0: xzy 0: pqr /abc/match_invalid_utf ab\x{df00}ab\=ph Partial match: ab \= Expect no match ab\x{df00}cdef\=ph No match /ab$/match_invalid_utf ab\x{df00}cdeab 0: ab \= Expect no match ab\x{df00}cde No match /.../g,match_invalid_utf abcd\x{80}wxzy\x{df00}pqrs 0: abc 0: d\x{80}w 0: xzy 0: pqr /(?<=x)../g,match_invalid_utf abcd\x{80}wxzy\x{df00}pqrs 0: zy abcd\x{80}wxzy\x{df00}xpqrs 0: zy 0: pq /X$/match_invalid_utf \= Expect no match X\x{df00} No match /(?<=..)X/match_invalid_utf,aftertext AB\x{df00}AQXYZ 0: X 0+ YZ AB\x{df00}AQXYZ\=offset=5 0: X 0+ YZ AB\x{df00}\x{df00}AXYZXC\=offset=5 0: X 0+ C \= Expect no match AB\x{df00}XYZ No match AB\x{df00}XYZ\=offset=3 No match AB\x{df00}AXYZ No match AB\x{df00}AXYZ\=offset=4 No match AB\x{df00}\x{df00}AXYZ\=offset=5 No match /.../match_invalid_utf \= Expect no match A\x{d800}B No match A\x{110000}B No match # ---------------------------------------------------- /(*UTF)(?=\x{123})/I Capture group count = 0 May match empty string Compile options: Overall options: utf First code unit = \x{123} Subject length lower bound = 1 /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf Capture group count = 0 Options: utf First code unit = \xc1 (caseless) Last code unit = \x{145} (caseless) Subject length lower bound = 3 /[\xff\x{ffff}]/I,utf Capture group count = 0 Options: utf Starting code units: \xff Subject length lower bound = 1 /[\xff\x{ff}]/I,utf Capture group count = 0 Options: utf Starting code units: \xff Subject length lower bound = 1 /[\xff\x{ff}]/I Capture group count = 0 Starting code units: \xff Subject length lower bound = 1 /[Ss]/I Capture group count = 0 Starting code units: S s Subject length lower bound = 1 /[Ss]/I,utf Capture group count = 0 Options: utf Starting code units: S s Subject length lower bound = 1 /(?:\x{ff}|\x{3000})/I,utf Capture group count = 0 Options: utf Starting code units: \xff Subject length lower bound = 1 # End of testinput12