# This set of tests is for UTF-8 support and Unicode property support, with # relevance only for the 8-bit library. # The next 5 patterns have UTF-8 errors /[Ã]/utf Failed: error -8 at offset 1: UTF-8 error: byte 2 top bits not 0x80 /Ã/utf Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end /ÃÃÃxxx/utf Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80 /‚‚‚‚‚‚‚Ã/utf Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set /‚‚‚‚‚‚‚Ã/match_invalid_utf Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set # Now test subjects /badutf/utf \= Expect UTF-8 errors X\xdf Failed: error -3: UTF-8 error: 1 byte missing at end at offset 1 XX\xef Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2 XXX\xef\x80 Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3 X\xf7 Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 1 XX\xf7\x80 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2 XXX\xf7\x80\x80 Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3 \xfb Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0 \xfb\x80 Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0 \xfb\x80\x80 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 \xfb\x80\x80\x80 Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0 \xfd Failed: error -7: UTF-8 error: 5 bytes missing at end at offset 0 \xfd\x80 Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0 \xfd\x80\x80 Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0 \xfd\x80\x80\x80 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 \xfd\x80\x80\x80\x80 Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0 \xdf\x7f Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0 \xef\x7f\x80 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0 \xef\x80\x7f Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0 \xf7\x7f\x80\x80 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0 \xf7\x80\x7f\x80 Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0 \xf7\x80\x80\x7f Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 at offset 0 \xfb\x7f\x80\x80\x80 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0 \xfb\x80\x7f\x80\x80 Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0 \xfb\x80\x80\x7f\x80 Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 at offset 0 \xfb\x80\x80\x80\x7f Failed: error -11: UTF-8 error: byte 5 top bits not 0x80 at offset 0 \xfd\x7f\x80\x80\x80\x80 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0 \xfd\x80\x7f\x80\x80\x80 Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0 \xfd\x80\x80\x7f\x80\x80 Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 at offset 0 \xfd\x80\x80\x80\x7f\x80 Failed: error -11: UTF-8 error: byte 5 top bits not 0x80 at offset 0 \xfd\x80\x80\x80\x80\x7f Failed: error -12: UTF-8 error: byte 6 top bits not 0x80 at offset 0 \xed\xa0\x80 Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0 \xc0\x8f Failed: error -17: UTF-8 error: overlong 2-byte sequence at offset 0 \xe0\x80\x8f Failed: error -18: UTF-8 error: overlong 3-byte sequence at offset 0 \xf0\x80\x80\x8f Failed: error -19: UTF-8 error: overlong 4-byte sequence at offset 0 \xf8\x80\x80\x80\x8f Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0 \xfc\x80\x80\x80\x80\x8f Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0 \x80 Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0 \xfe Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0 \xff Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0 /badutf/utf \= Expect UTF-8 errors XX\xfb\x80\x80\x80\x80 Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 2 XX\xfd\x80\x80\x80\x80\x80 Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 2 XX\xf7\xbf\xbf\xbf Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 2 /shortutf/utf \= Expect UTF-8 errors XX\xdf\=ph Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2 XX\xef\=ph Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2 XX\xef\x80\=ph Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2 \xf7\=ph Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0 \xf7\x80\=ph Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 \xf7\x80\x80\=ph Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0 \xfb\=ph Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0 \xfb\x80\=ph Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0 \xfb\x80\x80\=ph Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 \xfb\x80\x80\x80\=ph Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0 \xfd\=ph Failed: error -7: UTF-8 error: 5 bytes missing at end at offset 0 \xfd\x80\=ph Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0 \xfd\x80\x80\=ph Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0 \xfd\x80\x80\x80\=ph Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0 \xfd\x80\x80\x80\x80\=ph Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0 /anything/utf \= Expect UTF-8 errors X\xc0\x80 Failed: error -17: UTF-8 error: overlong 2-byte sequence at offset 1 XX\xc1\x8f Failed: error -17: UTF-8 error: overlong 2-byte sequence at offset 2 XXX\xe0\x9f\x80 Failed: error -18: UTF-8 error: overlong 3-byte sequence at offset 3 \xf0\x8f\x80\x80 Failed: error -19: UTF-8 error: overlong 4-byte sequence at offset 0 \xf8\x87\x80\x80\x80 Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0 \xfc\x83\x80\x80\x80\x80 Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0 \xfe\x80\x80\x80\x80\x80 Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0 \xff\x80\x80\x80\x80\x80 Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0 \xf8\x88\x80\x80\x80 Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 0 \xf9\x87\x80\x80\x80 Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 0 \xfc\x84\x80\x80\x80\x80 Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0 \xfd\x83\x80\x80\x80\x80 Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0 \= Expect no match \xc3\x8f No match \xe0\xaf\x80 No match \xe1\x80\x80 No match \xf0\x9f\x80\x80 No match \xf1\x8f\x80\x80 No match \xf8\x88\x80\x80\x80\=no_utf_check No match \xf9\x87\x80\x80\x80\=no_utf_check No match \xfc\x84\x80\x80\x80\x80\=no_utf_check No match \xfd\x83\x80\x80\x80\x80\=no_utf_check No match # Similar tests with offsets /badutf/utf \= Expect UTF-8 errors X\xdfabcd Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=1 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 \= Expect no match X\xdfabcd\=offset=2 No match /(?<=x)badutf/utf \= Expect UTF-8 errors X\xdfabcd Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=1 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=2 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\xdf\=offset=3 Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6 \= Expect no match X\xdfabcd\=offset=3 No match /(?<=xx)badutf/utf \= Expect UTF-8 errors X\xdfabcd Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=1 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=2 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=3 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 /(?<=xxxx)badutf/utf \= Expect UTF-8 errors X\xdfabcd Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=1 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=2 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabcd\=offset=3 Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 X\xdfabc\xdf\=offset=6 Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5 X\xdfabc\xdf\=offset=7 Failed: error -33: bad offset value \= Expect no match X\xdfabcd\=offset=6 No match /\x{100}/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x80 Subject length lower bound = 1 /\x{1000}/IB,utf ------------------------------------------------------------------ Bra \x{1000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xe1 Last code unit = \x80 Subject length lower bound = 1 /\x{10000}/IB,utf ------------------------------------------------------------------ Bra \x{10000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xf0 Last code unit = \x80 Subject length lower bound = 1 /\x{100000}/IB,utf ------------------------------------------------------------------ Bra \x{100000} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xf4 Last code unit = \x80 Subject length lower bound = 1 /\x{10ffff}/IB,utf ------------------------------------------------------------------ Bra \x{10ffff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xf4 Last code unit = \xbf Subject length lower bound = 1 /[\x{ff}]/IB,utf ------------------------------------------------------------------ Bra \x{ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc3 Last code unit = \xbf Subject length lower bound = 1 /[\x{100}]/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x80 Subject length lower bound = 1 /\x80/IB,utf ------------------------------------------------------------------ Bra \x{80} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc2 Last code unit = \x80 Subject length lower bound = 1 /\xff/IB,utf ------------------------------------------------------------------ Bra \x{ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc3 Last code unit = \xbf Subject length lower bound = 1 /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf ------------------------------------------------------------------ Bra \x{d55c}\x{ad6d}\x{c5b4} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xed Last code unit = \xb4 Subject length lower bound = 3 \x{D55c}\x{ad6d}\x{C5B4} 0: \x{d55c}\x{ad6d}\x{c5b4} /\x{65e5}\x{672c}\x{8a9e}/IB,utf ------------------------------------------------------------------ Bra \x{65e5}\x{672c}\x{8a9e} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xe6 Last code unit = \x9e Subject length lower bound = 3 \x{65e5}\x{672c}\x{8a9e} 0: \x{65e5}\x{672c}\x{8a9e} /\x{80}/IB,utf ------------------------------------------------------------------ Bra \x{80} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc2 Last code unit = \x80 Subject length lower bound = 1 /\x{084}/IB,utf ------------------------------------------------------------------ Bra \x{84} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc2 Last code unit = \x84 Subject length lower bound = 1 /\x{104}/IB,utf ------------------------------------------------------------------ Bra \x{104} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x84 Subject length lower bound = 1 /\x{861}/IB,utf ------------------------------------------------------------------ Bra \x{861} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xe0 Last code unit = \xa1 Subject length lower bound = 1 /\x{212ab}/IB,utf ------------------------------------------------------------------ Bra \x{212ab} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xf0 Last code unit = \xab Subject length lower bound = 1 /[^ab\xC0-\xF0]/IB,utf ------------------------------------------------------------------ Bra [\x00-`c-\xbf\xf1-\xff] (neg) Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 \x{f1} 0: \x{f1} \x{bf} 0: \x{bf} \x{100} 0: \x{100} \x{1000} 0: \x{1000} \= Expect no match \x{c0} No match \x{f0} No match /Ä€{3,4}/IB,utf ------------------------------------------------------------------ Bra \x{100}{3} \x{100}?+ Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x80 Subject length lower bound = 3 \x{100}\x{100}\x{100}\x{100\x{100} 0: \x{100}\x{100}\x{100} /(\x{100}+|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100}++ Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: x \xc4 Subject length lower bound = 1 /(\x{100}*a|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100}*+ a Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: a x \xc4 Subject length lower bound = 1 /(\x{100}{0,2}a|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100}{0,2}+ a Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: a x \xc4 Subject length lower bound = 1 /(\x{100}{1,2}a|x)/IB,utf ------------------------------------------------------------------ Bra CBra 1 \x{100} \x{100}{0,1}+ a Alt x Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: x \xc4 Subject length lower bound = 1 /\x{100}/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x80 Subject length lower bound = 1 /a\x{100}\x{101}*/IB,utf ------------------------------------------------------------------ Bra a\x{100} \x{101}*+ Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'a' Last code unit = \x80 Subject length lower bound = 2 /a\x{100}\x{101}+/IB,utf ------------------------------------------------------------------ Bra a\x{100} \x{101}++ Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'a' Last code unit = \x81 Subject length lower bound = 3 /[^\x{c4}]/IB ------------------------------------------------------------------ Bra [^\x{c4}] Ket End ------------------------------------------------------------------ Capture group count = 0 Subject length lower bound = 1 /[\x{100}]/IB,utf ------------------------------------------------------------------ Bra \x{100} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x80 Subject length lower bound = 1 \x{100} 0: \x{100} Z\x{100} 0: \x{100} \x{100}Z 0: \x{100} /[\xff]/IB,utf ------------------------------------------------------------------ Bra \x{ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc3 Last code unit = \xbf Subject length lower bound = 1 >\x{ff}< 0: \x{ff} /[^\xff]/IB,utf ------------------------------------------------------------------ Bra [^\x{ff}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Subject length lower bound = 1 /\x{100}abc(xyz(?1))/IB,utf ------------------------------------------------------------------ Bra \x{100}abc CBra 1 xyz Recurse Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf First code unit = \xc4 Last code unit = 'z' Subject length lower bound = 7 /\777/I,utf Capture group count = 0 Options: utf First code unit = \xc7 Last code unit = \xbf Subject length lower bound = 1 \x{1ff} 0: \x{1ff} \777 0: \x{1ff} /\x{100}+\x{200}/IB,utf ------------------------------------------------------------------ Bra \x{100}++ \x{200} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = \x80 Subject length lower bound = 2 /\x{100}+X/IB,utf ------------------------------------------------------------------ Bra \x{100}++ X Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc4 Last code unit = 'X' Subject length lower bound = 2 /^[\QÄ€\E-\QÅ\E/B,utf Failed: error 106 at offset 15: missing terminating ] for character class # This tests the stricter UTF-8 check according to RFC 3629. /X/utf \= Expect UTF-8 errors \x{d800} Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0 \x{da00} Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0 \x{dfff} Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0 \x{110000} Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 0 \x{2000000} Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 0 \x{7fffffff} Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0 \= Expect no match \x{d800}\=no_utf_check No match \x{da00}\=no_utf_check No match \x{dfff}\=no_utf_check No match \x{110000}\=no_utf_check No match \x{2000000}\=no_utf_check No match \x{7fffffff}\=no_utf_check No match /(*UTF8)\x{1234}/ abcd\x{1234}pqr 0: \x{1234} /(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I Capture group count = 0 Compile options: Overall options: utf \R matches any Unicode newline Forced newline is CRLF First code unit = 'a' Last code unit = 'b' Subject length lower bound = 3 /\h/I,utf Capture group count = 0 Options: utf Starting code units: \x09 \x20 \xc2 \xe1 \xe2 \xe3 Subject length lower bound = 1 ABC\x{09} 0: \x{09} ABC\x{20} 0: ABC\x{a0} 0: \x{a0} ABC\x{1680} 0: \x{1680} ABC\x{180e} 0: \x{180e} ABC\x{2000} 0: \x{2000} ABC\x{202f} 0: \x{202f} ABC\x{205f} 0: \x{205f} ABC\x{3000} 0: \x{3000} /\v/I,utf Capture group count = 0 Options: utf Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2 Subject length lower bound = 1 ABC\x{0a} 0: \x{0a} ABC\x{0b} 0: \x{0b} ABC\x{0c} 0: \x{0c} ABC\x{0d} 0: \x{0d} ABC\x{85} 0: \x{85} ABC\x{2028} 0: \x{2028} /\h*A/I,utf Capture group count = 0 Options: utf Starting code units: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 Last code unit = 'A' Subject length lower bound = 1 CDBABC 0: A /\v+A/I,utf Capture group count = 0 Options: utf Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2 Last code unit = 'A' Subject length lower bound = 2 /\s?xxx\s/I,utf Capture group count = 0 Options: utf Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x Last code unit = 'x' Subject length lower bound = 4 /\sxxx\s/I,utf,tables=2 Capture group count = 0 Options: utf Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xc2 Last code unit = 'x' Subject length lower bound = 5 AB\x{85}xxx\x{a0}XYZ 0: \x{85}xxx\x{a0} AB\x{a0}xxx\x{85}XYZ 0: \x{a0}xxx\x{85} /\S \S/I,utf,tables=2 Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Last code unit = ' ' Subject length lower bound = 3 \x{a2} \x{84} 0: \x{a2} \x{84} A Z 0: A Z /a+/utf a\x{123}aa\=offset=1 0: aa a\x{123}aa\=offset=3 0: aa a\x{123}aa\=offset=4 0: a \= Expect bad offset value a\x{123}aa\=offset=6 Failed: error -33: bad offset value \= Expect bad UTF-8 offset a\x{123}aa\=offset=2 Error -36 (bad UTF-8 offset) \= Expect no match a\x{123}aa\=offset=5 No match /\x{1234}+/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: \xe1 Subject length lower bound = 1 /\x{1234}+?/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: \xe1 Subject length lower bound = 1 /\x{1234}++/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: \xe1 Subject length lower bound = 1 /\x{1234}{2}/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: \xe1 Subject length lower bound = 2 /[^\x{c4}]/IB,utf ------------------------------------------------------------------ Bra [^\x{c4}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Subject length lower bound = 1 /X+\x{200}/IB,utf ------------------------------------------------------------------ Bra X++ \x{200} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'X' Last code unit = \x80 Subject length lower bound = 2 /\R/I,utf Capture group count = 0 Options: utf Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2 Subject length lower bound = 1 /\777/IB,utf ------------------------------------------------------------------ Bra \x{1ff} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = \xc7 Last code unit = \xbf Subject length lower bound = 1 /\w+\x{C4}/B,utf ------------------------------------------------------------------ Bra \w++ \x{c4} Ket End ------------------------------------------------------------------ a\x{C4}\x{C4} 0: a\x{c4} /\w+\x{C4}/B,utf,tables=2 ------------------------------------------------------------------ Bra \w+ \x{c4} Ket End ------------------------------------------------------------------ a\x{C4}\x{C4} 0: a\x{c4}\x{c4} /\W+\x{C4}/B,utf ------------------------------------------------------------------ Bra \W+ \x{c4} Ket End ------------------------------------------------------------------ !\x{C4} 0: !\x{c4} /\W+\x{C4}/B,utf,tables=2 ------------------------------------------------------------------ Bra \W++ \x{c4} Ket End ------------------------------------------------------------------ !\x{C4} 0: !\x{c4} /\W+\x{A1}/B,utf ------------------------------------------------------------------ Bra \W+ \x{a1} Ket End ------------------------------------------------------------------ !\x{A1} 0: !\x{a1} /\W+\x{A1}/B,utf,tables=2 ------------------------------------------------------------------ Bra \W+ \x{a1} Ket End ------------------------------------------------------------------ !\x{A1} 0: !\x{a1} /X\s+\x{A0}/B,utf ------------------------------------------------------------------ Bra X \s++ \x{a0} Ket End ------------------------------------------------------------------ X\x20\x{A0}\x{A0} 0: X \x{a0} /X\s+\x{A0}/B,utf,tables=2 ------------------------------------------------------------------ Bra X \s+ \x{a0} Ket End ------------------------------------------------------------------ X\x20\x{A0}\x{A0} 0: X \x{a0}\x{a0} /\S+\x{A0}/B,utf ------------------------------------------------------------------ Bra \S+ \x{a0} Ket End ------------------------------------------------------------------ X\x{A0}\x{A0} 0: X\x{a0}\x{a0} /\S+\x{A0}/B,utf,tables=2 ------------------------------------------------------------------ Bra \S++ \x{a0} Ket End ------------------------------------------------------------------ X\x{A0}\x{A0} 0: X\x{a0} /\x{a0}+\s!/B,utf ------------------------------------------------------------------ Bra \x{a0}++ \s ! Ket End ------------------------------------------------------------------ \x{a0}\x20! 0: \x{a0} ! /\x{a0}+\s!/B,utf,tables=2 ------------------------------------------------------------------ Bra \x{a0}+ \s ! Ket End ------------------------------------------------------------------ \x{a0}\x20! 0: \x{a0} ! /A/utf \x{ff000041} ** Character \x{ff000041} is greater than 0x7fffffff and so cannot be converted to UTF-8 \x{7f000041} Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0 /(*UTF8)abc/never_utf Failed: error 174 at offset 7: using UTF is disabled by the application /abc/utf,never_utf Failed: error 174 at offset 0: using UTF is disabled by the application /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf ------------------------------------------------------------------ Bra /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf First code unit = 'A' (caseless) Subject length lower bound = 5 /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf ------------------------------------------------------------------ Bra A\x{391}\x{10427}\x{ff3a}\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'A' Last code unit = \xb0 Subject length lower bound = 5 /AB\x{1fb0}/IB,utf ------------------------------------------------------------------ Bra AB\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf First code unit = 'A' Last code unit = \xb0 Subject length lower bound = 3 /AB\x{1fb0}/IBi,utf ------------------------------------------------------------------ Bra /i AB\x{1fb0} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf First code unit = 'A' (caseless) Last code unit = 'B' (caseless) Subject length lower bound = 3 /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: \xd0 \xd1 Subject length lower bound = 17 \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} /[â±¥]/Bi,utf ------------------------------------------------------------------ Bra /i \x{2c65} Ket End ------------------------------------------------------------------ /[^â±¥]/Bi,utf ------------------------------------------------------------------ Bra /i [^\x{2c65}] Ket End ------------------------------------------------------------------ /\h/I Capture group count = 0 Starting code units: \x09 \x20 \xa0 Subject length lower bound = 1 /\v/I Capture group count = 0 Starting code units: \x0a \x0b \x0c \x0d \x85 Subject length lower bound = 1 /\R/I Capture group count = 0 Starting code units: \x0a \x0b \x0c \x0d \x85 Subject length lower bound = 1 /[[:blank:]]/B,ucp ------------------------------------------------------------------ Bra [\x09 \xa0] Ket End ------------------------------------------------------------------ /\x{212a}+/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: K k \xe2 Subject length lower bound = 1 KKkk\x{212a} 0: KKkk\x{212a} /s+/Ii,utf Capture group count = 0 Options: caseless utf Starting code units: S s \xc5 Subject length lower bound = 1 SSss\x{17f} 0: SSss\x{17f} /\x{100}*A/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ A Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: A \xc4 Last code unit = 'A' Subject length lower bound = 1 A 0: A /\x{100}*\d(?R)/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \d Recurse Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: 0 1 2 3 4 5 6 7 8 9 \xc4 Subject length lower bound = 1 /[Z\x{100}]/IB,utf ------------------------------------------------------------------ Bra [Z\x{100}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: Z \xc4 Subject length lower bound = 1 Z\x{100} 0: Z \x{100} 0: \x{100} \x{100}Z 0: \x{100} /[z-\x{100}]/IB,utf ------------------------------------------------------------------ Bra [z-\xff\x{100}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 Subject length lower bound = 1 /[z\Qa-d]Ä€\E]/IB,utf ------------------------------------------------------------------ Bra [\-\]adz\x{100}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: - ] a d z \xc4 Subject length lower bound = 1 \x{100} 0: \x{100} Ä€ 0: \x{100} /[ab\x{100}]abc(xyz(?1))/IB,utf ------------------------------------------------------------------ Bra [ab\x{100}] abc CBra 1 xyz Recurse Ket Ket End ------------------------------------------------------------------ Capture group count = 1 Options: utf Starting code units: a b \xc4 Last code unit = 'z' Subject length lower bound = 7 /\x{100}*\s/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \s Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xc4 Subject length lower bound = 1 /\x{100}*\d/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \d Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: 0 1 2 3 4 5 6 7 8 9 \xc4 Subject length lower bound = 1 /\x{100}*\w/IB,utf ------------------------------------------------------------------ Bra \x{100}*+ \w Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z \xc4 Subject length lower bound = 1 /\x{100}*\D/IB,utf ------------------------------------------------------------------ Bra \x{100}* \D Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /\x{100}*\S/IB,utf ------------------------------------------------------------------ Bra \x{100}* \S Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /\x{100}*\W/IB,utf ------------------------------------------------------------------ Bra \x{100}* \W Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ ` { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /[\x{105}-\x{109}]/IBi,utf ------------------------------------------------------------------ Bra [\x{104}-\x{109}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: \xc4 Subject length lower bound = 1 \x{104} 0: \x{104} \x{105} 0: \x{105} \x{109} 0: \x{109} \= Expect no match \x{100} No match \x{10a} No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 Subject length lower bound = 1 Z 0: Z z 0: z \x{39c} 0: \x{39c} \x{178} 0: \x{178} | 0: | \x{80} 0: \x{80} \x{ff} 0: \x{ff} \x{100} 0: \x{100} \x{101} 0: \x{101} \= Expect no match \x{102} No match Y No match y No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 Subject length lower bound = 1 /\x{3a3}B/IBi,utf ------------------------------------------------------------------ Bra clist 03a3 03c2 03c3 /i B Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf Starting code units: \xce \xcf Last code unit = 'B' (caseless) Subject length lower bound = 2 /abc/utf,replace=à abc Failed: error -3: UTF-8 error: 1 byte missing at end /(?<=(a)(?-1))x/I,utf Capture group count = 1 Max lookbehind = 2 Options: utf First code unit = 'x' Subject length lower bound = 1 a\x80zx\=offset=3 Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1 /[\W\p{Any}]/B ------------------------------------------------------------------ Bra [\x00-/:-@[-^`{-\xff\p{Any}] Ket End ------------------------------------------------------------------ abc 0: a 123 0: 1 /[\W\pL]/B ------------------------------------------------------------------ Bra [\x00-/:-@[-^`{-\xff\p{L}] Ket End ------------------------------------------------------------------ abc 0: a \= Expect no match 123 No match /(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':Æ¿)/utf Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN) /[\s[:^ascii:]]/B,ucp ------------------------------------------------------------------ Bra [\x80-\xff\p{Xsp}] Ket End ------------------------------------------------------------------ # A special extra option allows excaped surrogate code points in 8-bit mode, # but subjects containing them must not be UTF-checked. /\x{d800}/I,utf,allow_surrogate_escapes Capture group count = 0 Options: utf Extra options: allow_surrogate_escapes First code unit = \xed Last code unit = \x80 Subject length lower bound = 1 \x{d800}\=no_utf_check 0: \x{d800} /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes \x{dfff}\x{df01}\=no_utf_check 0: \x{dfff}\x{df01} # This has different starting code units in 8-bit mode. /^[^ab]/IB,utf ------------------------------------------------------------------ Bra ^ [\x00-`c-\xff] (neg) Ket End ------------------------------------------------------------------ Capture group count = 0 Compile options: utf Overall options: anchored utf Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 c 0: c \x{ff} 0: \x{ff} \x{100} 0: \x{100} \= Expect no match aaa No match # Offsets are different in 8-bit mode. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout 123abcáyzabcdef789abcሴqr 1(2) Old 6 6 "" New 6 8 "<>" 2(2) Old 13 13 "" New 15 17 "<>" 3(2) Old 13 16 "def" New 17 22 "" 4(2) Old 22 22 "" New 28 30 "<>" 4: 123abc<>\x{e1}yzabc<>789abc<>\x{1234}qr # Check name length with non-ASCII characters /(?'ABáC678901234567890123456789012'...)/utf /(?'ABáC6789012345678901234567890123'...)/utf Failed: error 148 at offset 36: subpattern name is too long (maximum 32 code units) /(?'ABZC6789012345678901234567890123'...)/utf /(?(n/utf Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?) /(?(á/utf Failed: error 142 at offset 5: syntax error in subpattern name (missing terminator?) # Invalid UTF-8 tests /.../g,match_invalid_utf abcd\x80wxzy\x80pqrs 0: abc 0: wxz 0: pqr abcd\x{80}wxzy\x80pqrs 0: abc 0: d\x{80}w 0: xzy 0: pqr /abc/match_invalid_utf ab\x80ab\=ph Partial match: ab \= Expect no match ab\x80cdef\=ph No match /ab$/match_invalid_utf ab\x80cdeab 0: ab \= Expect no match ab\x80cde No match /.../g,match_invalid_utf abcd\x{80}wxzy\x80pqrs 0: abc 0: d\x{80}w 0: xzy 0: pqr /(?<=x)../g,match_invalid_utf abcd\x{80}wxzy\x80pqrs 0: zy abcd\x{80}wxzy\x80xpqrs 0: zy 0: pq /X$/match_invalid_utf \= Expect no match X\xc4 No match /(?<=..)X/match_invalid_utf,aftertext AB\x80AQXYZ 0: X 0+ YZ AB\x80AQXYZ\=offset=5 0: X 0+ YZ AB\x80\x80AXYZXC\=offset=5 0: X 0+ C \= Expect no match AB\x80XYZ No match AB\x80XYZ\=offset=3 No match AB\xfeXYZ No match AB\xffXYZ\=offset=3 No match AB\x80AXYZ No match AB\x80AXYZ\=offset=4 No match AB\x80\x80AXYZ\=offset=5 No match /.../match_invalid_utf AB\xc4CCC 0: CCC \= Expect no match A\x{d800}B No match A\x{110000}B No match A\xc4B No match /\bX/match_invalid_utf A\x80X 0: X /\BX/match_invalid_utf \= Expect no match A\x80X No match /(?<=...)X/match_invalid_utf AAA\x80BBBXYZ 0: X \= Expect no match AAA\x80BXYZ No match AAA\x80BBXYZ No match # ------------------------------------- /(*UTF)(?=\x{123})/I Capture group count = 0 May match empty string Compile options: Overall options: utf First code unit = \xc4 Last code unit = \xa3 Subject length lower bound = 1 /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf Capture group count = 0 Options: utf Starting code units: \xc3 Last code unit = 'X' Subject length lower bound = 3 /[󿾟,]/BI,utf ------------------------------------------------------------------ Bra [,\x{fff9f}] Ket End ------------------------------------------------------------------ Capture group count = 0 Options: utf Starting code units: , \xf3 Subject length lower bound = 1 /[\x{fff4}-\x{ffff8}]/I,utf Capture group count = 0 Options: utf Starting code units: \xef \xf0 \xf1 \xf2 \xf3 Subject length lower bound = 1 /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf Capture group count = 0 Options: utf Starting code units: \xef \xf0 \xf1 \xf2 \xf4 Subject length lower bound = 1 /[\xff\x{ffff}]/I,utf Capture group count = 0 Options: utf Starting code units: \xc3 \xef Subject length lower bound = 1 /[\xff\x{ff}]/I,utf Capture group count = 0 Options: utf Starting code units: \xc3 Subject length lower bound = 1 abc\x{ff}def 0: \x{ff} /[\xff\x{ff}]/I Capture group count = 0 First code unit = \xff Subject length lower bound = 1 abc\x{ff}def 0: \xff /[Ss]/I Capture group count = 0 First code unit = 'S' (caseless) Subject length lower bound = 1 /[Ss]/I,utf Capture group count = 0 Options: utf Starting code units: S s Subject length lower bound = 1 /(?:\x{ff}|\x{3000})/I,utf Capture group count = 0 Options: utf Starting code units: \xc3 \xe3 Subject length lower bound = 1 /x/utf abxyz 0: x \x80\=startchar Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0 abc\x80\=startchar Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3 abc\x80\=startchar,offset=3 Error -36 (bad UTF-8 offset) /\x{c1}+\x{e1}/iIB,ucp ------------------------------------------------------------------ Bra /i \x{c1}+ /i \x{e1} Ket End ------------------------------------------------------------------ Capture group count = 0 Options: caseless ucp First code unit = \xc1 (caseless) Last code unit = \xe1 (caseless) Subject length lower bound = 2 \x{c1}\x{c1}\x{c1} 0: \xc1\xc1\xc1 \x{e1}\x{e1}\x{e1} 0: \xe1\xe1\xe1 /a|\x{c1}/iI,ucp Capture group count = 0 Options: caseless ucp Starting code units: A a \xc1 \xe1 Subject length lower bound = 1 \x{e1}xxx 0: \xe1 /a|\x{c1}/iI,utf Capture group count = 0 Options: caseless utf Starting code units: A a \xc3 Subject length lower bound = 1 \x{e1}xxx 0: \x{e1} /\x{c1}|\x{e1}/iI,ucp Capture group count = 0 Options: caseless ucp First code unit = \xc1 (caseless) Subject length lower bound = 1 /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended X\x{e1}Y 1: >\xc1< /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended X\x{c1}Y 1: >\xe1< # Without UTF or UCP characters > 127 have only one case in the default locale. /X(\x{e1})Y/replace=>\U$1<,substitute_extended X\x{e1}Y 1: >\xe1< /A/utf,match_invalid_utf,caseless \xe5A 0: A # End of testinput10