# This set of tests is for UTF-8 support and Unicode property support, with # relevance only for the 8-bit library. # The next 5 patterns have UTF-8 errors /[Ã]/utf /Ã/utf /ÃÃÃxxx/utf /‚‚‚‚‚‚‚Ã/utf /‚‚‚‚‚‚‚Ã/match_invalid_utf # Now test subjects /badutf/utf \= Expect UTF-8 errors X\xdf XX\xef XXX\xef\x80 X\xf7 XX\xf7\x80 XXX\xf7\x80\x80 \xfb \xfb\x80 \xfb\x80\x80 \xfb\x80\x80\x80 \xfd \xfd\x80 \xfd\x80\x80 \xfd\x80\x80\x80 \xfd\x80\x80\x80\x80 \xdf\x7f \xef\x7f\x80 \xef\x80\x7f \xf7\x7f\x80\x80 \xf7\x80\x7f\x80 \xf7\x80\x80\x7f \xfb\x7f\x80\x80\x80 \xfb\x80\x7f\x80\x80 \xfb\x80\x80\x7f\x80 \xfb\x80\x80\x80\x7f \xfd\x7f\x80\x80\x80\x80 \xfd\x80\x7f\x80\x80\x80 \xfd\x80\x80\x7f\x80\x80 \xfd\x80\x80\x80\x7f\x80 \xfd\x80\x80\x80\x80\x7f \xed\xa0\x80 \xc0\x8f \xe0\x80\x8f \xf0\x80\x80\x8f \xf8\x80\x80\x80\x8f \xfc\x80\x80\x80\x80\x8f \x80 \xfe \xff /badutf/utf \= Expect UTF-8 errors XX\xfb\x80\x80\x80\x80 XX\xfd\x80\x80\x80\x80\x80 XX\xf7\xbf\xbf\xbf /shortutf/utf \= Expect UTF-8 errors XX\xdf\=ph XX\xef\=ph XX\xef\x80\=ph \xf7\=ph \xf7\x80\=ph \xf7\x80\x80\=ph \xfb\=ph \xfb\x80\=ph \xfb\x80\x80\=ph \xfb\x80\x80\x80\=ph \xfd\=ph \xfd\x80\=ph \xfd\x80\x80\=ph \xfd\x80\x80\x80\=ph \xfd\x80\x80\x80\x80\=ph /anything/utf \= Expect UTF-8 errors X\xc0\x80 XX\xc1\x8f XXX\xe0\x9f\x80 \xf0\x8f\x80\x80 \xf8\x87\x80\x80\x80 \xfc\x83\x80\x80\x80\x80 \xfe\x80\x80\x80\x80\x80 \xff\x80\x80\x80\x80\x80 \xf8\x88\x80\x80\x80 \xf9\x87\x80\x80\x80 \xfc\x84\x80\x80\x80\x80 \xfd\x83\x80\x80\x80\x80 \= Expect no match \xc3\x8f \xe0\xaf\x80 \xe1\x80\x80 \xf0\x9f\x80\x80 \xf1\x8f\x80\x80 \xf8\x88\x80\x80\x80\=no_utf_check \xf9\x87\x80\x80\x80\=no_utf_check \xfc\x84\x80\x80\x80\x80\=no_utf_check \xfd\x83\x80\x80\x80\x80\=no_utf_check # Similar tests with offsets /badutf/utf \= Expect UTF-8 errors X\xdfabcd X\xdfabcd\=offset=1 \= Expect no match X\xdfabcd\=offset=2 /(?<=x)badutf/utf \= Expect UTF-8 errors X\xdfabcd X\xdfabcd\=offset=1 X\xdfabcd\=offset=2 X\xdfabcd\xdf\=offset=3 \= Expect no match X\xdfabcd\=offset=3 /(?<=xx)badutf/utf \= Expect UTF-8 errors X\xdfabcd X\xdfabcd\=offset=1 X\xdfabcd\=offset=2 X\xdfabcd\=offset=3 /(?<=xxxx)badutf/utf \= Expect UTF-8 errors X\xdfabcd X\xdfabcd\=offset=1 X\xdfabcd\=offset=2 X\xdfabcd\=offset=3 X\xdfabc\xdf\=offset=6 X\xdfabc\xdf\=offset=7 \= Expect no match X\xdfabcd\=offset=6 /\x{100}/IB,utf /\x{1000}/IB,utf /\x{10000}/IB,utf /\x{100000}/IB,utf /\x{10ffff}/IB,utf /[\x{ff}]/IB,utf /[\x{100}]/IB,utf /\x80/IB,utf /\xff/IB,utf /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf \x{D55c}\x{ad6d}\x{C5B4} /\x{65e5}\x{672c}\x{8a9e}/IB,utf \x{65e5}\x{672c}\x{8a9e} /\x{80}/IB,utf /\x{084}/IB,utf /\x{104}/IB,utf /\x{861}/IB,utf /\x{212ab}/IB,utf /[^ab\xC0-\xF0]/IB,utf \x{f1} \x{bf} \x{100} \x{1000} \= Expect no match \x{c0} \x{f0} /Ä€{3,4}/IB,utf \x{100}\x{100}\x{100}\x{100\x{100} /(\x{100}+|x)/IB,utf /(\x{100}*a|x)/IB,utf /(\x{100}{0,2}a|x)/IB,utf /(\x{100}{1,2}a|x)/IB,utf /\x{100}/IB,utf /a\x{100}\x{101}*/IB,utf /a\x{100}\x{101}+/IB,utf /[^\x{c4}]/IB /[\x{100}]/IB,utf \x{100} Z\x{100} \x{100}Z /[\xff]/IB,utf >\x{ff}< /[^\xff]/IB,utf /\x{100}abc(xyz(?1))/IB,utf /\777/I,utf \x{1ff} \777 /\x{100}+\x{200}/IB,utf /\x{100}+X/IB,utf /^[\QÄ€\E-\QÅ\E/B,utf # This tests the stricter UTF-8 check according to RFC 3629. /X/utf \= Expect UTF-8 errors \x{d800} \x{da00} \x{dfff} \x{110000} \x{2000000} \x{7fffffff} \= Expect no match \x{d800}\=no_utf_check \x{da00}\=no_utf_check \x{dfff}\=no_utf_check \x{110000}\=no_utf_check \x{2000000}\=no_utf_check \x{7fffffff}\=no_utf_check /(*UTF8)\x{1234}/ abcd\x{1234}pqr /(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I /\h/I,utf ABC\x{09} ABC\x{20} ABC\x{a0} ABC\x{1680} ABC\x{180e} ABC\x{2000} ABC\x{202f} ABC\x{205f} ABC\x{3000} /\v/I,utf ABC\x{0a} ABC\x{0b} ABC\x{0c} ABC\x{0d} ABC\x{85} ABC\x{2028} /\h*A/I,utf CDBABC /\v+A/I,utf /\s?xxx\s/I,utf /\sxxx\s/I,utf,tables=2 AB\x{85}xxx\x{a0}XYZ AB\x{a0}xxx\x{85}XYZ /\S \S/I,utf,tables=2 \x{a2} \x{84} A Z /a+/utf a\x{123}aa\=offset=1 a\x{123}aa\=offset=3 a\x{123}aa\=offset=4 \= Expect bad offset value a\x{123}aa\=offset=6 \= Expect bad UTF-8 offset a\x{123}aa\=offset=2 \= Expect no match a\x{123}aa\=offset=5 /\x{1234}+/Ii,utf /\x{1234}+?/Ii,utf /\x{1234}++/Ii,utf /\x{1234}{2}/Ii,utf /[^\x{c4}]/IB,utf /X+\x{200}/IB,utf /\R/I,utf /\777/IB,utf /\w+\x{C4}/B,utf a\x{C4}\x{C4} /\w+\x{C4}/B,utf,tables=2 a\x{C4}\x{C4} /\W+\x{C4}/B,utf !\x{C4} /\W+\x{C4}/B,utf,tables=2 !\x{C4} /\W+\x{A1}/B,utf !\x{A1} /\W+\x{A1}/B,utf,tables=2 !\x{A1} /X\s+\x{A0}/B,utf X\x20\x{A0}\x{A0} /X\s+\x{A0}/B,utf,tables=2 X\x20\x{A0}\x{A0} /\S+\x{A0}/B,utf X\x{A0}\x{A0} /\S+\x{A0}/B,utf,tables=2 X\x{A0}\x{A0} /\x{a0}+\s!/B,utf \x{a0}\x20! /\x{a0}+\s!/B,utf,tables=2 \x{a0}\x20! /A/utf \x{ff000041} \x{7f000041} /(*UTF8)abc/never_utf /abc/utf,never_utf /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf /AB\x{1fb0}/IB,utf /AB\x{1fb0}/IBi,utf /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} /[â±¥]/Bi,utf /[^â±¥]/Bi,utf /\h/I /\v/I /\R/I /[[:blank:]]/B,ucp /\x{212a}+/Ii,utf KKkk\x{212a} /s+/Ii,utf SSss\x{17f} /\x{100}*A/IB,utf A /\x{100}*\d(?R)/IB,utf /[Z\x{100}]/IB,utf Z\x{100} \x{100} \x{100}Z /[z-\x{100}]/IB,utf /[z\Qa-d]Ä€\E]/IB,utf \x{100} Ä€ /[ab\x{100}]abc(xyz(?1))/IB,utf /\x{100}*\s/IB,utf /\x{100}*\d/IB,utf /\x{100}*\w/IB,utf /\x{100}*\D/IB,utf /\x{100}*\S/IB,utf /\x{100}*\W/IB,utf /[\x{105}-\x{109}]/IBi,utf \x{104} \x{105} \x{109} \= Expect no match \x{100} \x{10a} /[z-\x{100}]/IBi,utf Z z \x{39c} \x{178} | \x{80} \x{ff} \x{100} \x{101} \= Expect no match \x{102} Y y /[z-\x{100}]/IBi,utf /\x{3a3}B/IBi,utf /abc/utf,replace=à abc /(?<=(a)(?-1))x/I,utf a\x80zx\=offset=3 /[\W\p{Any}]/B abc 123 /[\W\pL]/B abc \= Expect no match 123 /(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':Æ¿)/utf /[\s[:^ascii:]]/B,ucp # A special extra option allows excaped surrogate code points in 8-bit mode, # but subjects containing them must not be UTF-checked. /\x{d800}/I,utf,allow_surrogate_escapes \x{d800}\=no_utf_check /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes \x{dfff}\x{df01}\=no_utf_check # This has different starting code units in 8-bit mode. /^[^ab]/IB,utf c \x{ff} \x{100} \= Expect no match aaa # Offsets are different in 8-bit mode. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout 123abcáyzabcdef789abcሴqr # Check name length with non-ASCII characters /(?'ABáC678901234567890123456789012'...)/utf /(?'ABáC6789012345678901234567890123'...)/utf /(?'ABZC6789012345678901234567890123'...)/utf /(?(n/utf /(?(á/utf # Invalid UTF-8 tests /.../g,match_invalid_utf abcd\x80wxzy\x80pqrs abcd\x{80}wxzy\x80pqrs /abc/match_invalid_utf ab\x80ab\=ph \= Expect no match ab\x80cdef\=ph /ab$/match_invalid_utf ab\x80cdeab \= Expect no match ab\x80cde /.../g,match_invalid_utf abcd\x{80}wxzy\x80pqrs /(?<=x)../g,match_invalid_utf abcd\x{80}wxzy\x80pqrs abcd\x{80}wxzy\x80xpqrs /X$/match_invalid_utf \= Expect no match X\xc4 /(?<=..)X/match_invalid_utf,aftertext AB\x80AQXYZ AB\x80AQXYZ\=offset=5 AB\x80\x80AXYZXC\=offset=5 \= Expect no match AB\x80XYZ AB\x80XYZ\=offset=3 AB\xfeXYZ AB\xffXYZ\=offset=3 AB\x80AXYZ AB\x80AXYZ\=offset=4 AB\x80\x80AXYZ\=offset=5 /.../match_invalid_utf AB\xc4CCC \= Expect no match A\x{d800}B A\x{110000}B A\xc4B /\bX/match_invalid_utf A\x80X /\BX/match_invalid_utf \= Expect no match A\x80X /(?<=...)X/match_invalid_utf AAA\x80BBBXYZ \= Expect no match AAA\x80BXYZ AAA\x80BBXYZ # ------------------------------------- /(*UTF)(?=\x{123})/I /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf /[󿾟,]/BI,utf /[\x{fff4}-\x{ffff8}]/I,utf /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf /[\xff\x{ffff}]/I,utf /[\xff\x{ff}]/I,utf abc\x{ff}def /[\xff\x{ff}]/I abc\x{ff}def /[Ss]/I /[Ss]/I,utf /(?:\x{ff}|\x{3000})/I,utf /x/utf abxyz \x80\=startchar abc\x80\=startchar abc\x80\=startchar,offset=3 /\x{c1}+\x{e1}/iIB,ucp \x{c1}\x{c1}\x{c1} \x{e1}\x{e1}\x{e1} /a|\x{c1}/iI,ucp \x{e1}xxx /a|\x{c1}/iI,utf \x{e1}xxx /\x{c1}|\x{e1}/iI,ucp /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended X\x{e1}Y /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended X\x{c1}Y # Without UTF or UCP characters > 127 have only one case in the default locale. /X(\x{e1})Y/replace=>\U$1<,substitute_extended X\x{e1}Y /A/utf,match_invalid_utf,caseless \xe5A /\bch\b/utf,match_invalid_utf qchq\=ph qchq\=ps # End of testinput10