From e022475d5424552a7bbd7fd787c4845a17e219be Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 5 Aug 2014 16:51:32 +0000 Subject: [PATCH] All tests except JIT and save/reload are implemented. --- RunTest | 481 +++++--------- doc/pcre2test.1 | 5 +- src/pcre2_compile.c | 7 +- src/pcre2_error.c | 8 +- src/pcre2_internal.h | 7 +- src/pcre2_printint.c | 3 +- src/pcre2_valid_utf.c | 2 +- src/pcre2posix.c | 6 +- src/pcre2test.c | 20 +- testdata/testinput10 | 398 ++++++++++++ testdata/testinput11 | 357 +++++++++++ testdata/testinput12 | 332 ++++++++++ testdata/testinput13 | 22 + testdata/testinput2 | 2 +- testdata/testinput5 | 4 +- testdata/testinput8 | 141 +++++ testdata/testinput9 | 333 ++++++++++ testdata/testoutput10 | 1273 ++++++++++++++++++++++++++++++++++++++ testdata/testoutput11-16 | 675 ++++++++++++++++++++ testdata/testoutput11-32 | 681 ++++++++++++++++++++ testdata/testoutput12-16 | 1159 ++++++++++++++++++++++++++++++++++ testdata/testoutput12-32 | 1157 ++++++++++++++++++++++++++++++++++ testdata/testoutput13 | 27 + testdata/testoutput2 | 2 +- testdata/testoutput5 | 8 +- testdata/testoutput8-16 | 745 ++++++++++++++++++++++ testdata/testoutput8-32 | 745 ++++++++++++++++++++++ testdata/testoutput8-8 | 745 ++++++++++++++++++++++ testdata/testoutput9 | 498 +++++++++++++++ 29 files changed, 9495 insertions(+), 348 deletions(-) create mode 100644 testdata/testinput10 create mode 100644 testdata/testinput11 create mode 100644 testdata/testinput12 create mode 100644 testdata/testinput13 create mode 100644 testdata/testinput8 create mode 100644 testdata/testinput9 create mode 100644 testdata/testoutput10 create mode 100644 testdata/testoutput11-16 create mode 100644 testdata/testoutput11-32 create mode 100644 testdata/testoutput12-16 create mode 100644 testdata/testoutput12-32 create mode 100644 testdata/testoutput13 create mode 100644 testdata/testoutput8-16 create mode 100644 testdata/testoutput8-32 create mode 100644 testdata/testoutput8-8 create mode 100644 
testdata/testoutput9 diff --git a/RunTest b/RunTest index 435bfab..e178d81 100755 --- a/RunTest +++ b/RunTest @@ -58,22 +58,18 @@ title5B=" and UCP support" title6="Test 6: DFA matching main non-UTF, non-UCP functionality" title7A="Test 7: DFA matching with UTF" title7B=" and Unicode property support" -#title11="Test 11: Internal offsets and code size tests" +title8="Test 8: Internal offsets and code size tests" +title9="Test 9: Specials for the basic 8-bit library" +title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support" +title11="Test 11: Specials for the basic 16-bit and 32-bit libraries" +title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support" +title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries" + #title12="Test 12: JIT-specific features (when JIT is available)" #title13="Test 13: JIT-specific features (when JIT is not available)" -#title14="Test 14: Specials for the basic 8-bit library" -#title15="Test 15: Specials for the 8-bit library with UTF-8 support" -#title16="Test 16: Specials for the 8-bit library with Unicode propery support" -#title17="Test 17: Specials for the basic 16/32-bit library" -#title18="Test 18: Specials for the 16/32-bit library with UTF-16/32 support" -#title19="Test 19: Specials for the 16/32-bit library with Unicode property support" -#title20="Test 20: DFA specials for the basic 16/32-bit library" + #title21="Test 21: Reloads for the basic 16/32-bit library" #title22="Test 22: Reloads for the 16/32-bit library with UTF-16/32 support" -#title23="Test 23: Specials for the 16-bit library" -#title24="Test 24: Specials for the 16-bit library with UTF-16 support" -#title25="Test 25: Specials for the 32-bit library" -#title26="Test 26: Specials for the 32-bit library with UTF-32 support" maxtest=2 @@ -85,12 +81,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then echo $title5A $title5B echo $title6 echo $title7A $title7B -# echo $title8 -# echo $title9 -# echo $title10 -# echo 
$title11 -# echo $title12 -# echo $title13 + echo $title8 + echo $title9 + echo $title10 + echo $title11 + echo $title12 + echo $title13 # echo $title14 # echo $title15 # echo $title16 @@ -100,10 +96,6 @@ if [ $# -eq 1 -a "$1" = "list" ]; then # echo $title20 # echo $title21 # echo $title22 -# echo $title23 -# echo $title24 -# echo $title25 -# echo $title26 exit 0 fi @@ -178,12 +170,12 @@ do4=no do5=no do6=no do7=no -#do8=no -#do9=no -#do10=no -#do11=no -#do12=no -#do13=no +do8=no +do9=no +do10=no +do11=no +do12=no +do13=no #do14=no #do15=no #do16=no @@ -193,10 +185,6 @@ do7=no #do20=no #do21=no #do22=no -#do23=no -#do24=no -#do25=no -#do26=no while [ $# -gt 0 ] ; do case $1 in @@ -207,12 +195,12 @@ while [ $# -gt 0 ] ; do 5) do5=yes;; 6) do6=yes;; 7) do7=yes;; -# 8) do8=yes;; -# 9) do9=yes;; -# 10) do10=yes;; -# 11) do11=yes;; -# 12) do12=yes;; -# 13) do13=yes;; + 8) do8=yes;; + 9) do9=yes;; + 10) do10=yes;; + 11) do11=yes;; + 12) do12=yes;; + 13) do13=yes;; # 14) do14=yes;; # 15) do15=yes;; # 16) do16=yes;; @@ -222,10 +210,6 @@ while [ $# -gt 0 ] ; do # 20) do20=yes;; # 21) do21=yes;; # 22) do22=yes;; -# 23) do23=yes;; -# 24) do24=yes;; -# 25) do25=yes;; -# 26) do26=yes;; -8) arg8=yes;; -16) arg16=yes;; -32) arg32=yes;; @@ -330,7 +314,7 @@ else fi # UTF support always applies to all bit sizes if both are supported; we can't -# have UTF-8 support without UTF-16 support (for example). +# have UTF-8 support without UTF-16 or UTF-32 support. $sim ./pcre2test -C utf >/dev/null utf=$? @@ -346,14 +330,13 @@ fi # relevant will be automatically skipped. 
if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \ - $do5 = no -a $do6 = no -a $do7 = no \ + $do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \ + $do9 = no -a $do10 = no -a $do11 = no -a $do12 = no -a \ + $do13 = no \ ]; then -# -a $do8 = no -a \ -# $do9 = no -a $do10 = no -a $do11 = no -a $do12 = no -a \ -# $do13 = no -a $do14 = no -a $do15 = no -a $do16 = no -a \ +# -a $do14 = no -a $do15 = no -a $do16 = no -a \ # $do17 = no -a $do18 = no -a $do19 = no -a $do20 = no -a \ -# $do21 = no -a $do22 = no -a $do23 = no -a $do24 = no -a \ -# $do25 = no -a $do26 = no +# $do21 = no -a $do22 = no do1=yes do2=yes @@ -362,12 +345,12 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \ do5=yes do6=yes do7=yes -# do8=yes -# do9=yes -# do10=yes -# do11=yes -# do12=yes -# do13=yes + do8=yes + do9=yes + do10=yes + do11=yes + do12=yes + do13=yes # do14=yes # do15=yes # do16=yes @@ -377,10 +360,6 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \ # do20=yes # do21=yes # do22=yes -# do23=yes -# do24=yes -# do25=yes -# do26=yes fi # Handle any explicit skips at this stage, so that an argument list may consist @@ -584,32 +563,137 @@ if [ $do7 = yes ] ; then fi fi -## Test of internal offsets and code sizes. This test is run only when there -## is Unicode property support and the link size is 2. The actual tests are -## mostly the same as in some of the above, but in this test we inspect some -## offsets and sizes that require a known link size. This is a doublecheck for -## the maintainer, just in case something changes unexpectely. The output from -## this test is not the same in 8-bit and 16-bit modes. -# -#if [ $do11 = yes ] ; then -# echo $title11 -# if [ $link_size -ne 2 ] ; then -# echo " Skipped because link size is not 2" -# elif [ $ucp -eq 0 ] ; then -# echo " Skipped because Unicode property support is not available" -# else -# for opt in "" "-s"; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput11 testtry -# if [ $? 
= 0 ] ; then -# $cf $testdata/testoutput11-$bits testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi -# done -# fi -#fi -# +# Test of internal offsets and code sizes. This test is run only when there +# is UTF/UCP support and the link size is 2. The actual tests are +# mostly the same as in some of the above, but in this test we inspect some +# offsets and sizes that require a known link size. This is a doublecheck for +# the maintainer, just in case something changes unexpectedly. The output from +# this test is different in 8-bit, 16-bit, and 32-bit modes, so there are +# mode-specific output files. + +if [ $do8 = yes ] ; then + echo $title8 + if [ $link_size -ne 2 ] ; then + echo " Skipped because link size is not 2" + elif [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + $sim $valgrind ./pcre2test -q $bmode $testdata/testinput8 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput8-$bits testtry + if [ $? != 0 ] ; then exit 1; fi + else exit 1 + fi + echo " OK" + fi +fi + +# Tests for 8-bit-specific features + +if [ "$do9" = yes ] ; then + echo $title9 + if [ "$bits" = "16" -o "$bits" = "32" ] ; then + echo " Skipped when running 16/32-bit tests" + else + for opt in "" $jitopt; do + $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput9 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput9 testtry + if [ $?
!= 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-jit" ] ; then echo " OK with JIT" + else echo " OK" + fi + done + fi +fi + +# Tests for UTF-8 and UCP 8-bit-specific features + +if [ "$do10" = yes ] ; then + echo $title10 + if [ "$bits" = "16" -o "$bits" = "32" ] ; then + echo " Skipped when running 16/32-bit tests" + elif [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + for opt in "" $jitopt; do + $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput10 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput10 testtry + if [ $? != 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-jit" ] ; then echo " OK with JIT" + else echo " OK" + fi + done + fi +fi + +# Tests for 16-bit and 32-bit features. Output is different for the two widths. + +if [ $do11 = yes ] ; then + echo $title11 + if [ "$bits" = "8" ] ; then + echo " Skipped when running 8-bit tests" + else + for opt in "" $jitopt; do + $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput11 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput11-$bits testtry + if [ $? != 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-jit" ] ; then echo " OK with JIT" + else echo " OK" + fi + done + fi +fi + +# Tests for 16-bit and 32-bit features with UTF-16/32 and UCP support. Output +# is different for the two widths. + +if [ $do12 = yes ] ; then + echo $title12 + if [ "$bits" = "8" ] ; then + echo " Skipped when running 8-bit tests" + elif [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + for opt in "" $jitopt; do + $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput12 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput12-$bits testtry + if [ $? 
!= 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-jit" ] ; then echo " OK with JIT" + else echo " OK" + fi + done + fi +fi + +# Tests for 16/32-bit-specific features in DFA non-UTF modes + +if [ $do13 = yes ] ; then + echo $title13 + if [ "$bits" = "8" ] ; then + echo " Skipped when running 8-bit tests" + else + $sim $valgrind ./pcre2test -q $bmode $testdata/testinput13 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput13 testtry + if [ $? != 0 ] ; then exit 1; fi + else exit 1 + fi + echo " OK" + fi +fi + ## Test JIT-specific features when JIT is available # #if [ $do12 = yes ] ; then @@ -644,169 +728,6 @@ fi # fi #fi # -## Tests for 8-bit-specific features -# -#if [ "$do14" = yes ] ; then -# echo $title14 -# if [ "$bits" = "16" -o "$bits" = "32" ] ; then -# echo " Skipped when running 16/32-bit tests" -# else -# cp -f $testdata/saved16 testsaved16 -# cp -f $testdata/saved32 testsaved32 -# for opt in "" "-s" $jitopt; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput14 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput14 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" -# else echo " OK" -# fi -# done -# fi -#fi -# -## Tests for 8-bit-specific features (needs UTF-8 support) -# -#if [ "$do15" = yes ] ; then -# echo $title15 -# if [ "$bits" = "16" -o "$bits" = "32" ] ; then -# echo " Skipped when running 16/32-bit tests" -# elif [ $utf -eq 0 ] ; then -# echo " Skipped because UTF-$bits support is not available" -# else -# for opt in "" "-s" $jitopt; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput15 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput15 testtry -# if [ $? 
!= 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" -# else echo " OK" -# fi -# done -# fi -#fi -# -## Tests for 8-bit-specific features (Unicode property support) -# -#if [ $do16 = yes ] ; then -# echo $title16 -# if [ "$bits" = "16" -o "$bits" = "32" ] ; then -# echo " Skipped when running 16/32-bit tests" -# elif [ $ucp -eq 0 ] ; then -# echo " Skipped because Unicode property support is not available" -# else -# for opt in "" "-s" $jitopt; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput16 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput16 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" -# else echo " OK" -# fi -# done -# fi -#fi -# -## Tests for 16/32-bit-specific features -# -#if [ $do17 = yes ] ; then -# echo $title17 -# if [ "$bits" = "8" ] ; then -# echo " Skipped when running 8-bit tests" -# else -# for opt in "" "-s" $jitopt; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput17 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput17 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" -# else echo " OK" -# fi -# done -# fi -#fi -# -## Tests for 16/32-bit-specific features (UTF-16/32 support) -# -#if [ $do18 = yes ] ; then -# echo $title18 -# if [ "$bits" = "8" ] ; then -# echo " Skipped when running 8-bit tests" -# elif [ $utf -eq 0 ] ; then -# echo " Skipped because UTF-$bits support is not available" -# else -# for opt in "" "-s" $jitopt; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput18 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput18-$bits testtry -# if [ $? 
!= 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" -# else echo " OK" -# fi -# done -# fi -#fi -# -## Tests for 16/32-bit-specific features (Unicode property support) -# -#if [ $do19 = yes ] ; then -# echo $title19 -# if [ "$bits" = "8" ] ; then -# echo " Skipped when running 8-bit tests" -# elif [ $ucp -eq 0 ] ; then -# echo " Skipped because Unicode property support is not available" -# else -# for opt in "" "-s" $jitopt; do -# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput19 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput19 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" -# else echo " OK" -# fi -# done -# fi -#fi -# -## Tests for 16/32-bit-specific features in DFA non-UTF-16/32 mode -# -#if [ $do20 = yes ] ; then -# echo $title20 -# if [ "$bits" = "8" ] ; then -# echo " Skipped when running 8-bit tests" -# else -# for opt in "" "-s"; do -# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput20 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput20 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# if [ "$opt" = "-s" ] ; then echo " OK with study" -# else echo " OK" -# fi -# done -# fi -#fi -# ## Tests for reloads with 16/32-bit library # #if [ $do21 = yes ] ; then @@ -855,70 +776,6 @@ fi # echo " OK" # fi #fi -# -#if [ $do23 = yes ] ; then -# echo $title23 -# if [ "$bits" = "8" -o "$bits" = "32" ] ; then -# echo " Skipped when running 8/32-bit tests" -# else -# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput23 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput23 testtry -# if [ $? 
!= 0 ] ; then exit 1; fi -# else exit 1 -# fi -# echo " OK" -# fi -#fi -# -#if [ $do24 = yes ] ; then -# echo $title24 -# if [ "$bits" = "8" -o "$bits" = "32" ] ; then -# echo " Skipped when running 8/32-bit tests" -# elif [ $utf -eq 0 ] ; then -# echo " Skipped because UTF-$bits support is not available" -# else -# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput24 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput24 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# echo " OK" -# fi -#fi -# -#if [ $do25 = yes ] ; then -# echo $title25 -# if [ "$bits" = "8" -o "$bits" = "16" ] ; then -# echo " Skipped when running 8/16-bit tests" -# else -# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput25 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput25 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# echo " OK" -# fi -#fi -# -#if [ $do26 = yes ] ; then -# echo $title26 -# if [ "$bits" = "8" -o "$bits" = "16" ] ; then -# echo " Skipped when running 8/16-bit tests" -# elif [ $utf -eq 0 ] ; then -# echo " Skipped because UTF-$bits support is not available" -# else -# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput26 testtry -# if [ $? = 0 ] ; then -# $cf $testdata/testoutput26 testtry -# if [ $? != 0 ] ; then exit 1; fi -# else exit 1 -# fi -# echo " OK" -# fi -#fi # End of loop for 8/16/32-bit tests done diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index dd7190f..2848851 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -286,9 +286,10 @@ This is a pattern line whose modifier list starts with two one-letter modifiers .SH "PATTERN SYNTAX" .rs .sp -A pattern line must start with one of the following characters: +A pattern line must start with one of the following characters (common symbols, +excluding pattern meta-characters): .sp - " / ! ' ` - + = : ; . , + / ! " ' ` - = _ : ; , % & @ ~ .sp This is interpreted as the pattern's delimiter. 
A regular expression may be continued over several input lines, in which case the newline characters are diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index f307198..4565e2b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -7833,11 +7833,12 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0) /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In 8-bit UTF mode, codepoints in the range 128-255 are introductory code - points and cannot have another case. In 16-bit and 32-bit mode, we can + points and cannot have another case. In 16-bit and 32-bit modes, we can check wide characters when UTF (and therefore UCP) is supported. */ #if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8 - else if (UCD_OTHERCASE(firstcu) != firstcu) + else if (firstcu <= MAX_UTF_CODE_POINT && + UCD_OTHERCASE(firstcu) != firstcu) re->flags |= PCRE2_FIRSTCASELESS; #endif } @@ -7870,7 +7871,7 @@ if (reqcuflags >= 0 && if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; } #if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8 - else if (UCD_OTHERCASE(reqcu) != reqcu) + else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) re->flags |= PCRE2_LASTCASELESS; #endif } diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 6983ecd..fa01036 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -184,8 +184,8 @@ static const char match_error_texts[] = "UTF-8 error: 5-byte character is not allowed (RFC 3629)\0" "UTF-8 error: 6-byte character is not allowed (RFC 3629)\0" /* 15 */ - "UTF-8 error: code point > 0x10ffff is not defined\0" - "UTF-8 error: code points 0xd000-0xdfff are not defined\0" + "UTF-8 error: code points greater than 0x10ffff are not defined\0" + "UTF-8 error: code points 0xd800-0xdfff are not defined\0" "UTF-8 error: overlong 2-byte sequence\0" "UTF-8 error: overlong 3-byte sequence\0" "UTF-8 error: overlong 4-byte sequence\0" @@ -198,8 +198,8 @@ static const char match_error_texts[] = /* 25 */ "UTF-16 error: invalid low 
surrogate\0" "UTF-16 error: isolated low surrogate\0" - "UTF-32 error: surrogate character not allowed\0" - "UTF-32 error: code point > 0x10ffff is not defined\0" + "UTF-32 error: code points 0xd800-0xdfff are not defined\0" + "UTF-32 error: code points greater than 0x10ffff are not defined\0" "bad count value\0" /* 30 */ "pattern compiled with other endianness\0" diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 37135ab..7c2132d 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -240,6 +240,10 @@ Unicode doesn't go beyond 0x0010ffff. */ #define NOTACHAR 0xffffffff +/* This is the largest valid UTF/Unicode code point. */ + +#define MAX_UTF_CODE_POINT 0x10ffff + /* Compile-time errors are added to this value. As they are documented, it should probably never be changed. */ @@ -574,9 +578,6 @@ total length. */ #define tables_length (ctypes_offset + 256) - - - /* -------------------- Character and string names ------------------------ */ /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 936a81d..acb1e3e 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -279,9 +279,8 @@ static void pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths) { PCRE2_SPTR codestart, nametable, code; -uint32_t options = re->compile_options; uint32_t nesize = re->name_entry_size; -BOOL utf = (options & PCRE2_UTF) != 0; +BOOL utf = (re->overall_options & PCRE2_UTF) != 0; nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)); code = codestart = nametable + re->name_count * re->name_entry_size; diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c index 9f68409..8382ab8 100644 --- a/src/pcre2_valid_utf.c +++ b/src/pcre2_valid_utf.c @@ -125,7 +125,7 @@ PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 
PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted -PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd000-0xdfff is not permitted +PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence diff --git a/src/pcre2posix.c b/src/pcre2posix.c index 72fecaf..31d8228 100644 --- a/src/pcre2posix.c +++ b/src/pcre2posix.c @@ -147,8 +147,6 @@ regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) const char *message, *addmessage; size_t length, addlength; -errcode -= COMPILE_ERROR_BASE; - message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))? "unknown error code" : pstring[errcode]; length = strlen(message) + 1; @@ -237,8 +235,8 @@ if (preg->re_pcre2_code == NULL) (void)pcre2_pattern_info((const pcre2_code *)preg->re_pcre2_code, PCRE2_INFO_CAPTURECOUNT, &re_nsub); preg->re_nsub = (size_t)re_nsub; -preg->re_match_data = ((cflags & REG_NOSUB) != 0)? NULL : - pcre2_match_data_create(re_nsub + 1, NULL); +if ((options & PCRE2_NO_AUTO_CAPTURE) != 0) re_nsub = -1; +preg->re_match_data = pcre2_match_data_create(re_nsub + 1, NULL); return 0; } diff --git a/src/pcre2test.c b/src/pcre2test.c index f0a09cf..d384662 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -497,7 +497,7 @@ static modstruct modlist[] = { #define POSIX_SUPPORTED_MATCH_OPTIONS ( \ PCRE2_NOTBOL|PCRE2_NOTEMPTY|PCRE2_NOTEOL) -#define POSIX_SUPPORTED_MATCH_CONTROLS ( 0 ) +#define POSIX_SUPPORTED_MATCH_CONTROLS (CTL_AFTERTEXT|CTL_ALLAFTERTEXT) /* Table of single-character abbreviated modifiers. The index field is initialized to -1, but the first time the modifier is encountered, it is filled @@ -2884,7 +2884,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((options & PCRE2_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "", ((options & PCRE2_UNGREEDY) != 0)? 
" ungreedy" : "", ((options & PCRE2_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "", - ((options & PCRE2_NO_AUTO_POSSESS) != 0)? " no_auto_possessify" : "", + ((options & PCRE2_NO_AUTO_POSSESS) != 0)? " no_auto_possess" : "", ((options & PCRE2_UTF) != 0)? " utf" : "", ((options & PCRE2_UCP) != 0)? " ucp" : "", ((options & PCRE2_NO_UTF_CHECK) != 0)? " no_utf_check" : "", @@ -3884,7 +3884,7 @@ static int callout_function(pcre2_callout_block_8 *cb) { uint32_t i, pre_start, post_start, subject_length; -BOOL utf = (FLD(compiled_code, compile_options) & PCRE2_UTF) != 0; +BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0; BOOL callout_capture = (dat_datctl.control & CTL_CALLOUT_CAPTURE) != 0; FILE *f = (first_callout || callout_capture)? outfile : NULL; @@ -4033,8 +4033,10 @@ dat_datctl.control |= (pat_patctl.control & CTL_ALLPD); /* Initialize for scanning the data line. */ -utf = (pat_patctl.control & CTL_POSIX) == 0 && - (FLD(compiled_code, compile_options) & PCRE2_UTF) != 0; +utf = ((((pat_patctl.control & CTL_POSIX) != 0)? + ((pcre2_real_code_8 *)preg.re_pcre2_code)->overall_options : + FLD(compiled_code, overall_options)) & PCRE2_UTF) != 0; + start_rep = NULL; len = strlen((const char *)buffer); while (len > 0 && isspace(buffer[len-1])) len--; @@ -4043,7 +4045,7 @@ p = buffer; while (isspace(*p)) p++; /* Check that the data is well-formed UTF-8 if we're in UTF mode. To create -invalid input to pcre2_exec, you must use \x?? or \x{} sequences. */ +invalid input to pcre2_match(), you must use \x?? or \x{} sequences. 
*/ if (utf) { @@ -4414,14 +4416,14 @@ if ((pat_patctl.control & CTL_POSIX) != 0) { fprintf(outfile, "%2d: ", (int)i); PCHARSV(dbuffer, pmatch[i].rm_so, - pmatch[i].rm_eo - pmatch[i].rm_so, FALSE, outfile); + pmatch[i].rm_eo - pmatch[i].rm_so, utf, outfile); fprintf(outfile, "\n"); if ((i == 0 && (dat_datctl.control & CTL_AFTERTEXT) != 0) || (dat_datctl.control & CTL_ALLAFTERTEXT) != 0) { fprintf(outfile, "%2d+ ", (int)i); PCHARSV(dbuffer, pmatch[i].rm_eo, len - pmatch[i].rm_eo, - FALSE, outfile); + utf, outfile); fprintf(outfile, "\n"); } } @@ -5587,7 +5589,7 @@ while (notdone) rc = process_command(); } - else if (strchr("\"/!'`-+=:;.,", *p) != NULL) + else if (strchr("/!\"'`%&-=_:;,@~", *p) != NULL) { rc = process_pattern(); dfa_matched = 0; diff --git a/testdata/testinput10 b/testdata/testinput10 new file mode 100644 index 0000000..effdd1b --- /dev/null +++ b/testdata/testinput10 @@ -0,0 +1,398 @@ +# This set of tests is for UTF-8 support and Unicode property support, with +# relevance only for the 8-bit library. 
+ +/X(\C{3})/utf + X\x{1234} + +/X(\C{4})/utf + X\x{1234}YZ + +/X\C*/utf + XYZabcdce + +/X\C*?/utf + XYZabcde + +/X\C{3,5}/utf + Xabcdefg + X\x{1234} + X\x{1234}YZ + X\x{1234}\x{512} + X\x{1234}\x{512}YZ + +/X\C{3,5}?/utf + Xabcdefg + X\x{1234} + X\x{1234}YZ + X\x{1234}\x{512} + +/a\Cb/utf + aXb + a\nb + +/a\C\Cb/utf + a\x{100}b + +/ab\Cde/utf + abXde + +/a\C\Cb/utf + a\x{100}b + ** Failers + a\x{12257}b + +/[]/utf + +//utf + +/xxx/utf + +/badutf/utf + \xdf + \xef + \xef\x80 + \xf7 + \xf7\x80 + \xf7\x80\x80 + \xfb + \xfb\x80 + \xfb\x80\x80 + \xfb\x80\x80\x80 + \xfd + \xfd\x80 + \xfd\x80\x80 + \xfd\x80\x80\x80 + \xfd\x80\x80\x80\x80 + \xdf\x7f + \xef\x7f\x80 + \xef\x80\x7f + \xf7\x7f\x80\x80 + \xf7\x80\x7f\x80 + \xf7\x80\x80\x7f + \xfb\x7f\x80\x80\x80 + \xfb\x80\x7f\x80\x80 + \xfb\x80\x80\x7f\x80 + \xfb\x80\x80\x80\x7f + \xfd\x7f\x80\x80\x80\x80 + \xfd\x80\x7f\x80\x80\x80 + \xfd\x80\x80\x7f\x80\x80 + \xfd\x80\x80\x80\x7f\x80 + \xfd\x80\x80\x80\x80\x7f + \xed\xa0\x80 + \xc0\x8f + \xe0\x80\x8f + \xf0\x80\x80\x8f + \xf8\x80\x80\x80\x8f + \xfc\x80\x80\x80\x80\x8f + \x80 + \xfe + \xff + +/badutf/utf + \xfb\x80\x80\x80\x80 + \xfd\x80\x80\x80\x80\x80 + \xf7\xbf\xbf\xbf + +/shortutf/utf + \xdf\=ph + \xef\=ph + \xef\x80\=ph + \xf7\=ph + \xf7\x80\=ph + \xf7\x80\x80\=ph + \xfb\=ph + \xfb\x80\=ph + \xfb\x80\x80\=ph + \xfb\x80\x80\x80\=ph + \xfd\=ph + \xfd\x80\=ph + \xfd\x80\x80\=ph + \xfd\x80\x80\x80\=ph + \xfd\x80\x80\x80\x80\=ph + +/anything/utf + \xc0\x80 + \xc1\x8f + \xe0\x9f\x80 + \xf0\x8f\x80\x80 + \xf8\x87\x80\x80\x80 + \xfc\x83\x80\x80\x80\x80 + \xfe\x80\x80\x80\x80\x80 + \xff\x80\x80\x80\x80\x80 + \xc3\x8f + \xe0\xaf\x80 + \xe1\x80\x80 + \xf0\x9f\x80\x80 + \xf1\x8f\x80\x80 + \xf8\x88\x80\x80\x80 + \xf9\x87\x80\x80\x80 + \xfc\x84\x80\x80\x80\x80 + \xfd\x83\x80\x80\x80\x80 + \xf8\x88\x80\x80\x80\=no_utf_check + \xf9\x87\x80\x80\x80\=no_utf_check + \xfc\x84\x80\x80\x80\x80\=no_utf_check + \xfd\x83\x80\x80\x80\x80\=no_utf_check + +/\x{100}/IB,utf + +/\x{1000}/IB,utf + 
+/\x{10000}/IB,utf + +/\x{100000}/IB,utf + +/\x{10ffff}/IB,utf + +/[\x{ff}]/IB,utf + +/[\x{100}]/IB,utf + +/\x80/IB,utf + +/\xff/IB,utf + +/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf + \x{D55c}\x{ad6d}\x{C5B4} + +/\x{65e5}\x{672c}\x{8a9e}/IB,utf + \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/IB,utf + +/\x{084}/IB,utf + +/\x{104}/IB,utf + +/\x{861}/IB,utf + +/\x{212ab}/IB,utf + +# This one is here not because it's different to Perl, but because the way +# the captured single-byte is displayed. (In Perl it becomes a character, and you +# can't tell the difference.) + +/X(\C)(.*)/utf + X\x{1234} + X\nabc + +# This one is here because Perl gives out a grumbly error message (quite +# correctly, but that messes up comparisons). + +/a\Cb/utf + *** Failers + a\x{100}b + +/[^ab\xC0-\xF0]/IB,utf + \x{f1} + \x{bf} + \x{100} + \x{1000} + *** Failers + \x{c0} + \x{f0} + +/Ā{3,4}/IB,utf + \x{100}\x{100}\x{100}\x{100\x{100} + +/(\x{100}+|x)/IB,utf + +/(\x{100}*a|x)/IB,utf + +/(\x{100}{0,2}a|x)/IB,utf + +/(\x{100}{1,2}a|x)/IB,utf + +/\x{100}/IB,utf + +/a\x{100}\x{101}*/IB,utf + +/a\x{100}\x{101}+/IB,utf + +/[^\x{c4}]/IB + +/[\x{100}]/IB,utf + \x{100} + Z\x{100} + \x{100}Z + *** Failers + +/[\xff]/IB,utf + >\x{ff}< + +/[^\xff]/IB,utf + +/\x{100}abc(xyz(?1))/IB,utf + +/a\x{1234}b/utf,posix + a\x{1234}b + +/\777/I,utf + \x{1ff} + \777 + +/\x{100}+\x{200}/IB,utf + +/\x{100}+X/IB,utf + +/^[\QĀ\E-\QŐ\E/B,utf + +# This tests the stricter UTF-8 check according to RFC 3629. 
+ +/X/utf + \x{d800} + \x{d800}\=no_utf_check + \x{da00} + \x{da00}\=no_utf_check + \x{dfff} + \x{dfff}\=no_utf_check + \x{110000} + \x{110000}\=no_utf_check + \x{2000000} + \x{2000000}\=no_utf_check + \x{7fffffff} + \x{7fffffff}\=no_utf_check + +/(*UTF8)\x{1234}/ + abcd\x{1234}pqr + +/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I + +/\h/I,utf + ABC\x{09} + ABC\x{20} + ABC\x{a0} + ABC\x{1680} + ABC\x{180e} + ABC\x{2000} + ABC\x{202f} + ABC\x{205f} + ABC\x{3000} + +/\v/I,utf + ABC\x{0a} + ABC\x{0b} + ABC\x{0c} + ABC\x{0d} + ABC\x{85} + ABC\x{2028} + +/\h*A/I,utf + CDBABC + +/\v+A/I,utf + +/\s?xxx\s/I,utf + +/\sxxx\s/I,utf,tables=2 + AB\x{85}xxx\x{a0}XYZ + AB\x{a0}xxx\x{85}XYZ + +/\S \S/I,utf,tables=2 + \x{a2} \x{84} + A Z + +/a+/utf + a\x{123}aa\=offset=1 + a\x{123}aa\=offset=2 + a\x{123}aa\=offset=3 + a\x{123}aa\=offset=4 + a\x{123}aa\=offset=5 + a\x{123}aa\=offset=6 + +/\x{1234}+/Ii,utf + +/\x{1234}+?/Ii,utf + +/\x{1234}++/Ii,utf + +/\x{1234}{2}/Ii,utf + +/[^\x{c4}]/IB,utf + +/X+\x{200}/IB,utf + +/\R/I,utf + +/\777/IB,utf + +/\w+\x{C4}/B,utf + a\x{C4}\x{C4} + +/\w+\x{C4}/B,utf,tables=2 + a\x{C4}\x{C4} + +/\W+\x{C4}/B,utf + !\x{C4} + +/\W+\x{C4}/B,utf,tables=2 + !\x{C4} + +/\W+\x{A1}/B,utf + !\x{A1} + +/\W+\x{A1}/B,utf,tables=2 + !\x{A1} + +/X\s+\x{A0}/B,utf + X\x20\x{A0}\x{A0} + +/X\s+\x{A0}/B,utf,tables=2 + X\x20\x{A0}\x{A0} + +/\S+\x{A0}/B,utf + X\x{A0}\x{A0} + +/\S+\x{A0}/B,utf,tables=2 + X\x{A0}\x{A0} + +/\x{a0}+\s!/B,utf + \x{a0}\x20! + +/\x{a0}+\s!/B,utf,tables=2 + \x{a0}\x20! 
+ +/A/utf + \x{ff000041} + \x{7f000041} + +/(*UTF8)abc/never_utf + +/abc/utf,never_utf + +/\w/posix + +++\x{c2} + +/\w/ucp,posix + +++\x{c2} + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf + +/AB\x{1fb0}/IB,utf + +/AB\x{1fb0}/IBi,utf + +/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf + \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + +/[ⱥ]/Bi,utf + +/[^ⱥ]/Bi,utf + +/\h/I + +/\v/I + +/\R/I + +/[[:blank:]]/B,ucp + +/\x{212a}+/Ii,utf + KKkk\x{212a} + +/s+/Ii,utf + SSss\x{17f} + +# End of testinput10 diff --git a/testdata/testinput11 b/testdata/testinput11 new file mode 100644 index 0000000..43d89e1 --- /dev/null +++ b/testdata/testinput11 @@ -0,0 +1,357 @@ +# This set of tests is for the 16-bit and 32-bit libraries' basic (non-UTF) +# features that are not compatible with the 8-bit library, or which give +# different output in 16-bit or 32-bit mode. The output for the two widths is +# different, so they have separate output files. + +#forbid_utf + +/a\Cb/ + aXb + a\nb + +/[^\x{c4}]/IB + +/\x{100}/I + +/ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional leading comment +(?: (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address +| # or +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # one word, optionally followed by.... +(?: +[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or... +\( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) | # comments, or... + +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +# quoted strings +)* +< (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # leading < +(?: @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... 
+(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* + +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* , (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +)* # further okay, if led by comma +: # closing colon +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* )? # optional route +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address spec +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* > # trailing > +# name and address +) (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional trailing comment +/Ix + +/[\h]/B + >\x09< + +/[\h]+/B + >\x09\x20\xa0< + +/[\v]/B + +/[^\h]/B + +/\h+/I + \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000} + \x{3001}\x{2fff}\x{200a}\xa0\x{2000} + +/[\h\x{dc00}]+/IB + \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000} + \x{3001}\x{2fff}\x{200a}\xa0\x{2000} + +/\H+/I + \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f} + \x{2000}\x{200a}\x{1fff}\x{200b} + \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060} + \xa0\x{3000}\x9f\xa1\x{2fff}\x{3001} + +/[\H\x{d800}]+/ + \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f} + \x{2000}\x{200a}\x{1fff}\x{200b} + \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060} + \xa0\x{3000}\x9f\xa1\x{2fff}\x{3001} + +/\v+/I + \x{2027}\x{2030}\x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + +/[\v\x{dc00}]+/IB + \x{2027}\x{2030}\x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + +/\V+/I + \x{2028}\x{2029}\x{2027}\x{2030} + \x85\x0a\x0b\x0c\x0d\x09\x0e\x84\x86 + +/[\V\x{d800}]+/ + \x{2028}\x{2029}\x{2027}\x{2030} + \x85\x0a\x0b\x0c\x0d\x09\x0e\x84\x86 + +/\R+/I,bsr=unicode + \x{2027}\x{2030}\x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + +/\x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00}/I + \x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00} + +/[^\x{80}][^\x{ff}][^\x{100}][^\x{1000}][^\x{ffff}]/B + +/[^\x{80}][^\x{ff}][^\x{100}][^\x{1000}][^\x{ffff}]/Bi + +/[^\x{100}]*[^\x{1000}]+[^\x{ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{100}]{5,6}+/B + 
+/[^\x{100}]*[^\x{1000}]+[^\x{ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{100}]{5,6}+/Bi + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF)XX/mark + XX + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE)XX/mark + XX + +/\u0100/B,alt_bsux,allow_empty_class,match_unset_backref + +/[\u0100-\u0200]/B,alt_bsux,allow_empty_class,match_unset_backref + +/\ud800/B,alt_bsux,allow_empty_class,match_unset_backref + +/^\x{ffff}+/i + \x{ffff} + +/^\x{ffff}?/i + \x{ffff} + +/^\x{ffff}*/i + \x{ffff} + +/^\x{ffff}{3}/i + \x{ffff}\x{ffff}\x{ffff} + +/^\x{ffff}{0,3}/i + \x{ffff} + +/[^\x00-a]{12,}[^b-\xff]*/B + +/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B + +/a*[b-\x{200}]?a#a*[b-\x{200}]?b#[a-f]*[g-\x{200}]*#[g-\x{200}]*[a-c]*#[g-\x{200}]*[a-h]*/B + +/^[\x{1234}\x{4321}]{2,4}?/ + \x{1234}\x{1234}\x{1234} + +# Check maximum non-UTF character size for the 16-bit library. + +/\x{ffff}/ + A\x{ffff}B + +/\x{10000}/ + +/\o{20000}/ + +# Check maximum character size for the 32-bit library. These will all give +# errors in the 16-bit library. 
+ +/\x{110000}/ + +/\x{7fffffff}/ + +/\x{80000000}/ + +/\x{ffffffff}/ + +/\x{100000000}/ + +/\o{17777777777}/ + +/\o{20000000000}/ + +/\o{37777777777}/ + +/\o{40000000000}/ + +/\x{7fffffff}\x{7fffffff}/I + +/\x{80000000}\x{80000000}/I + +/\x{ffffffff}\x{ffffffff}/I + +# Non-UTF characters + +/\C{2,3}/ + \x{400000}\x{400001}\x{400002}\x{400003} + +/\x{400000}\x{800000}/IBi + +# Check character ranges + +/[\H]/IB + +/[\V]/IB + +# End of testinput11 diff --git a/testdata/testinput12 b/testdata/testinput12 new file mode 100644 index 0000000..e47c9d2 --- /dev/null +++ b/testdata/testinput12 @@ -0,0 +1,332 @@ +# This set of tests is for UTF-16 and UTF-32 support, and is relevant only to +# the 16-bit and 32-bit libraries. The output is different for each library, +# so there are separate output files. + +/xxx/IB,utf,no_utf_check + +/abc/utf + ] + +/X(\C{3})/utf + X\x{11234}Y + X\x{11234}YZ + +/X(\C{4})/utf + X\x{11234}YZ + X\x{11234}YZW + +/X\C*/utf + XYZabcdce + +/X\C*?/utf + XYZabcde + +/X\C{3,5}/utf + Xabcdefg + X\x{11234}Y + X\x{11234}YZ + X\x{11234}\x{512} + X\x{11234}\x{512}YZ + X\x{11234}\x{512}\x{11234}Z + +/X\C{3,5}?/utf + Xabcdefg + X\x{11234}Y + X\x{11234}YZ + X\x{11234}\x{512}YZ + *** Failers + X\x{11234} + +/a\Cb/utf + aXb + a\nb + +/a\C\Cb/utf + a\x{12257}b + a\x{12257}\x{11234}b + ** Failers + a\x{100}b + +/ab\Cde/utf + abXde + +# Check maximum character size + +/\x{ffff}/IB,utf + +/\x{10000}/IB,utf + +/\x{100}/IB,utf + +/\x{1000}/IB,utf + +/\x{10000}/IB,utf + +/\x{100000}/IB,utf + +/\x{10ffff}/IB,utf + +/[\x{ff}]/IB,utf + +/[\x{100}]/IB,utf + +/\x80/IB,utf + +/\xff/IB,utf + +/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf + \x{D55c}\x{ad6d}\x{C5B4} + +/\x{65e5}\x{672c}\x{8a9e}/IB,utf + \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/IB,utf + +/\x{084}/IB,utf + +/\x{104}/IB,utf + +/\x{861}/IB,utf + +/\x{212ab}/IB,utf + +# This one is here not because it's different to Perl, but because the way +# the captured single-byte is displayed. 
(In Perl it becomes a character, and you +# can't tell the difference.) + +/X(\C)(.*)/utf + X\x{1234} + X\nabc + +# This one is here because Perl gives out a grumbly error message (quite +# correctly, but that messes up comparisons). + +/a\Cb/utf + *** Failers + a\x{100}b + +/[^ab\xC0-\xF0]/IB,utf + \x{f1} + \x{bf} + \x{100} + \x{1000} + *** Failers + \x{c0} + \x{f0} + +/Ā{3,4}/IB,utf + \x{100}\x{100}\x{100}\x{100\x{100} + +/(\x{100}+|x)/IB,utf + +/(\x{100}*a|x)/IB,utf + +/(\x{100}{0,2}a|x)/IB,utf + +/(\x{100}{1,2}a|x)/IB,utf + +/\x{100}/IB,utf + +/a\x{100}\x{101}*/IB,utf + +/a\x{100}\x{101}+/IB,utf + +/[^\x{c4}]/IB + +/[\x{100}]/IB,utf + \x{100} + Z\x{100} + \x{100}Z + *** Failers + +/[\xff]/IB,utf + >\x{ff}< + +/[^\xff]/IB,utf + +/\x{100}abc(xyz(?1))/IB,utf + +/\777/I,utf + \x{1ff} + \777 + +/\x{100}+\x{200}/IB,utf + +/\x{100}+X/IB,utf + +/^[\QĀ\E-\QŐ\E/B,utf + +/X/utf + \x{d800} + \x{d800}\=no_utf_check + \x{da00} + \x{da00}\=no_utf_check + \x{dc00} + \x{dc00}\=no_utf_check + \x{de00} + \x{de00}\=no_utf_check + \x{dfff} + \x{dfff}\=no_utf_check + \x{110000} + \x{d800}\x{1234} + +/(*UTF16)\x{11234}/ + abcd\x{11234}pqr + +/(*UTF)\x{11234}/I + abcd\x{11234}pqr + +/(*UTF-32)\x{11234}/ + abcd\x{11234}pqr + +/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I + +/(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I + +/\h/I,utf + ABC\x{09} + ABC\x{20} + ABC\x{a0} + ABC\x{1680} + ABC\x{180e} + ABC\x{2000} + ABC\x{202f} + ABC\x{205f} + ABC\x{3000} + +/\v/I,utf + ABC\x{0a} + ABC\x{0b} + ABC\x{0c} + ABC\x{0d} + ABC\x{85} + ABC\x{2028} + +/\h*A/I,utf + CDBABC + \x{2000}ABC + +/\R*A/I,bsr=unicode,utf + CDBABC + \x{2028}A + +/\v+A/I,utf + +/\s?xxx\s/I,utf + +/\sxxx\s/I,utf,tables=2 + AB\x{85}xxx\x{a0}XYZ + AB\x{a0}xxx\x{85}XYZ + +/\S \S/I,utf,tables=2 + \x{a2} \x{84} + A Z + +/a+/utf + a\x{123}aa\=offset=1 + a\x{123}aa\=offset=2 + a\x{123}aa\=offset=3 + a\x{123}aa\=offset=4 + a\x{123}aa\=offset=5 + a\x{123}aa\=offset=6 + +/\x{1234}+/Ii,utf + +/\x{1234}+?/Ii,utf + +/\x{1234}++/Ii,utf + +/\x{1234}{2}/Ii,utf 
+ +/[^\x{c4}]/IB,utf + +/X+\x{200}/IB,utf + +/\R/I,utf + +# Check bad offset + +/a/utf + \x{10000}\=offset=1 + \x{10000}ab\=offset=1 + \x{10000}ab\=offset=2 + \x{10000}ab\=offset=3 + \x{10000}ab\=offset=4 + \x{10000}ab\=offset=5 + +//utf + +/\w+\x{C4}/B,utf + a\x{C4}\x{C4} + +/\w+\x{C4}/B,utf,tables=2 + a\x{C4}\x{C4} + +/\W+\x{C4}/B,utf + !\x{C4} + +/\W+\x{C4}/B,utf,tables=2 + !\x{C4} + +/\W+\x{A1}/B,utf + !\x{A1} + +/\W+\x{A1}/B,utf,tables=2 + !\x{A1} + +/X\s+\x{A0}/B,utf + X\x20\x{A0}\x{A0} + +/X\s+\x{A0}/B,utf,tables=2 + X\x20\x{A0}\x{A0} + +/\S+\x{A0}/B,utf + X\x{A0}\x{A0} + +/\S+\x{A0}/B,utf,tables=2 + X\x{A0}\x{A0} + +/\x{a0}+\s!/B,utf + \x{a0}\x20! + +/\x{a0}+\s!/B,utf,tables=2 + \x{a0}\x20! + +/(*UTF)abc/never_utf + +/abc/utf,never_utf + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf + +/AB\x{1fb0}/IB,utf + +/AB\x{1fb0}/IBi,utf + +/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf + \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + +/[ⱥ]/Bi,utf + +/[^ⱥ]/Bi,utf + +/[[:blank:]]/B,ucp + +/\x{212a}+/Ii,utf + KKkk\x{212a} + +/s+/Ii,utf + SSss\x{17f} + +# Non-UTF characters should give errors in both 16-bit and 32-bit modes. + +/\x{110000}/utf + +/\o{4200000}/utf + +/\C/utf + \x{110000} + +# End of testinput12 diff --git a/testdata/testinput13 b/testdata/testinput13 new file mode 100644 index 0000000..93ac25f --- /dev/null +++ b/testdata/testinput13 @@ -0,0 +1,22 @@ +# These DFA tests are for the handling of characters greater than 255 in +# 16-bit or 32-bit, non-UTF mode. 
+ +#forbid_utf +#subject dfa + +/^\x{ffff}+/i + \x{ffff} + +/^\x{ffff}?/i + \x{ffff} + +/^\x{ffff}*/i + \x{ffff} + +/^\x{ffff}{3}/i + \x{ffff}\x{ffff}\x{ffff} + +/^\x{ffff}{0,3}/i + \x{ffff} + +# End of testinput13 diff --git a/testdata/testinput2 b/testdata/testinput2 index 1c0bf90..bdcd801 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -1593,7 +1593,7 @@ a random value. /Ix abc\rdef abc\r\ndef -+((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)+I +!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I /* this is a C style comment */\=find_limits /(?P25[0-5]|2[0-4]\d|[01]?\d?\d)(?:\.(?P>B)){3}/I diff --git a/testdata/testinput5 b/testdata/testinput5 index d34b019..935cf98 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -573,11 +573,11 @@ /X\W{3}X/utf X\=ps -/\sxxx\s/utf,tables=1 +/\sxxx\s/utf,tables=2 AB\x{85}xxx\x{a0}XYZ AB\x{a0}xxx\x{85}XYZ -/\S \S/utf,tables=1 +/\S \S/utf,tables=2 \x{a2} \x{84} 'A#хц'Bx,newline=any,utf diff --git a/testdata/testinput8 b/testdata/testinput8 new file mode 100644 index 0000000..b781518 --- /dev/null +++ b/testdata/testinput8 @@ -0,0 +1,141 @@ +# These are a few representative patterns whose lengths and offsets are to be +# shown when the link size is 2. This is just a doublecheck test to ensure the +# sizes don't go horribly wrong when something is changed. The pattern contents +# are all themselves checked in other tests. Unicode, including property +# support, is required for these tests. 
+ +#pattern fullbincode,memory + +/((?i)b)/ + +/(?s)(.*X|^B)/ + +/(?s:.*X|^B)/ + +/^[[:alnum:]]/ + +/#/Ix + +/a#/Ix + +/x?+/ + +/x++/ + +/x{1,3}+/ + +/(x)*+/ + +/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/ + +"8J\$WE\<\.rX\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" + +"\$\<\.X\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" + +/(a(?1)b)/ + +/(a(?1)+b)/ + +/a(?Pb|c)d(?Pe)/ + +/(?:a(?Pc(?Pd)))(?Pa)/ + +/(?Pa)...(?P=a)bbb(?P>a)d/ + +/abc(?C255)de(?C)f/ + +/abcde/auto_callout + +/\x{100}/utf + +/\x{1000}/utf + +/\x{10000}/utf + +/\x{100000}/utf + +/\x{10ffff}/utf + +/\x{110000}/utf + +/[\x{ff}]/utf + +/[\x{100}]/utf + +/\x80/utf + +/\xff/utf + +/\x{0041}\x{2262}\x{0391}\x{002e}/I,utf + +/\x{D55c}\x{ad6d}\x{C5B4}/I,utf + +/\x{65e5}\x{672c}\x{8a9e}/I,utf + +/[\x{100}]/utf + +/[Z\x{100}]/utf + +/^[\x{100}\E-\Q\E\x{150}]/utf + +/^[\QĀ\E-\QŐ\E]/utf + +/^[\QĀ\E-\QŐ\E/utf + +/[\p{L}]/ + +/[\p{^L}]/ + +/[\P{L}]/ + +/[\P{^L}]/ + +/[abc\p{L}\x{0660}]/utf + +/[\p{Nd}]/utf + +/[\p{Nd}+-]+/utf + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/i,utf + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/utf + 
+/[\x{105}-\x{109}]/i,utf + +/( ( (?(1)0|) )* )/x + +/( (?(1)0|)* )/x + +/[a]/ + +/[a]/utf + +/[\xaa]/ + +/[\xaa]/utf + +/[^a]/ + +/[^a]/utf + +/[^\xaa]/ + +/[^\xaa]/utf + +#pattern -memory + +/[^\d]/utf,ucp + +/[[:^alpha:][:^cntrl:]]+/utf,ucp + +/[[:^cntrl:][:^alpha:]]+/utf,ucp + +/[[:alpha:]]+/utf,ucp + +/[[:^alpha:]\S]+/utf,ucp + +/abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ + +/(((a\2)|(a*)\g<-1>))*a?/ + +# End of testinput8 diff --git a/testdata/testinput9 b/testdata/testinput9 new file mode 100644 index 0000000..7ce8547 --- /dev/null +++ b/testdata/testinput9 @@ -0,0 +1,333 @@ +# This set of tests is run only with the 8-bit library. They do not require +# UTF-8 or Unicode property support. The file starts with all the tests of +# the POSIX interface, because that is supported only with the 8-bit library. + +#forbid_utf +#pattern posix + +/abc/ + abc + *** Failers + +/^abc|def/ + abcdef + abcdef\=notbol + +/.*((abc)$|(def))/ + defabc + defabc\=noteol + +/the quick brown fox/ + the quick brown fox + *** Failers + The Quick Brown Fox + +/the quick brown fox/i + the quick brown fox + The Quick Brown Fox + +/abc.def/ + *** Failers + abc\ndef + +/abc$/ + abc + abc\n + +/(abc)\2/ + +/(abc\1)/ + abc + +/a*(b+)(z)(z)/ + aaaabbbbzzzz + aaaabbbbzzzz\=ovector=0 + aaaabbbbzzzz\=ovector=1 + aaaabbbbzzzz\=ovector=2 + +/ab.cd/ + ab-cd + ab=cd + ** Failers + ab\ncd + +/ab.cd/s + ab-cd + ab=cd + ab\ncd + +/a(b)c/no_auto_capture + abc + +/a(?Pb)c/no_auto_capture + abc + +/a?|b?/ + abc + ** Failers + ddd\=notempty + +/\w+A/ + CDAAAAB + +/\w+A/ungreedy + CDAAAAB + +/\Biss\B/I,aftertext + Mississippi + +/abc/\ + +#pattern -posix + +# End of POSIX tests + +/a\Cb/ + aXb + a\nb + ** Failers (too big char) + A\x{123}B + A\o{443}B + +/\x{100}/I + +/\o{400}/I + +/ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional leading comment +(?: (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some 
number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address +| # or +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # one word, optionally followed by.... +(?: +[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or... +\( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) | # comments, or... + +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +# quoted strings +)* +< (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # leading < +(?: @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... 
+(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* + +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* , (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +)* # further okay, if led by comma +: # closing colon +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* )? # optional route +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address spec +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* > # trailing > +# name and address +) (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional trailing comment +/Ix + +/\h/I + +/\H/I + +/\v/I + +/\V/I + +/\R/I + +/[\h]/B + >\x09< + +/[\h]+/B + >\x09\x20\xa0< + +/[\v]/B + +/[\H]/B + +/[^\h]/B + +/[\V]/B + +/[\x0a\V]/B + +/\777/I + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF)XX/mark + XX + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE)XX/mark + XX + +/\u0100/alt_bsux,allow_empty_class,match_unset_backref,dupnames + +/[\u0100-\u0200]/alt_bsux,allow_empty_class,match_unset_backref,dupnames + +/[^\x00-a]{12,}[^b-\xff]*/B + +/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B + +# End of testinput9 diff --git a/testdata/testoutput10 b/testdata/testoutput10 new file mode 100644 index 0000000..5e83693 --- /dev/null +++ b/testdata/testoutput10 @@ -0,0 +1,1273 @@ +# This set of tests is for UTF-8 support and Unicode property support, with +# relevance only for the 8-bit library. 
+ +/X(\C{3})/utf + X\x{1234} + 0: X\x{1234} + 1: \x{1234} + +/X(\C{4})/utf + X\x{1234}YZ + 0: X\x{1234}Y + 1: \x{1234}Y + +/X\C*/utf + XYZabcdce + 0: XYZabcdce + +/X\C*?/utf + XYZabcde + 0: X + +/X\C{3,5}/utf + Xabcdefg + 0: Xabcde + X\x{1234} + 0: X\x{1234} + X\x{1234}YZ + 0: X\x{1234}YZ + X\x{1234}\x{512} + 0: X\x{1234}\x{512} + X\x{1234}\x{512}YZ + 0: X\x{1234}\x{512} + +/X\C{3,5}?/utf + Xabcdefg + 0: Xabc + X\x{1234} + 0: X\x{1234} + X\x{1234}YZ + 0: X\x{1234} + X\x{1234}\x{512} + 0: X\x{1234} + +/a\Cb/utf + aXb + 0: aXb + a\nb + 0: a\x{0a}b + +/a\C\Cb/utf + a\x{100}b + 0: a\x{100}b + +/ab\Cde/utf + abXde + 0: abXde + +/a\C\Cb/utf + a\x{100}b + 0: a\x{100}b + ** Failers +No match + a\x{12257}b +No match + +/[]/utf +Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80 + +//utf +Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end + +/xxx/utf +Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80 + +/badutf/utf + \xdf +Failed: error -3: UTF-8 error: 1 byte missing at end + \xef +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xef\x80 +Failed: error -3: UTF-8 error: 1 byte missing at end + \xf7 +Failed: error -5: UTF-8 error: 3 bytes missing at end + \xf7\x80 +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xf7\x80\x80 +Failed: error -3: UTF-8 error: 1 byte missing at end + \xfb +Failed: error -6: UTF-8 error: 4 bytes missing at end + \xfb\x80 +Failed: error -5: UTF-8 error: 3 bytes missing at end + \xfb\x80\x80 +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xfb\x80\x80\x80 +Failed: error -3: UTF-8 error: 1 byte missing at end + \xfd +Failed: error -7: UTF-8 error: 5 bytes missing at end + \xfd\x80 +Failed: error -6: UTF-8 error: 4 bytes missing at end + \xfd\x80\x80 +Failed: error -5: UTF-8 error: 3 bytes missing at end + \xfd\x80\x80\x80 +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xfd\x80\x80\x80\x80 +Failed: error -3: UTF-8 error: 1 byte missing at end + \xdf\x7f +Failed: error 
-8: UTF-8 error: byte 2 top bits not 0x80 + \xef\x7f\x80 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 + \xef\x80\x7f +Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 + \xf7\x7f\x80\x80 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 + \xf7\x80\x7f\x80 +Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 + \xf7\x80\x80\x7f +Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 + \xfb\x7f\x80\x80\x80 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 + \xfb\x80\x7f\x80\x80 +Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 + \xfb\x80\x80\x7f\x80 +Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 + \xfb\x80\x80\x80\x7f +Failed: error -11: UTF-8 error: byte 5 top bits not 0x80 + \xfd\x7f\x80\x80\x80\x80 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 + \xfd\x80\x7f\x80\x80\x80 +Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 + \xfd\x80\x80\x7f\x80\x80 +Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 + \xfd\x80\x80\x80\x7f\x80 +Failed: error -11: UTF-8 error: byte 5 top bits not 0x80 + \xfd\x80\x80\x80\x80\x7f +Failed: error -12: UTF-8 error: byte 6 top bits not 0x80 + \xed\xa0\x80 +Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined + \xc0\x8f +Failed: error -17: UTF-8 error: overlong 2-byte sequence + \xe0\x80\x8f +Failed: error -18: UTF-8 error: overlong 3-byte sequence + \xf0\x80\x80\x8f +Failed: error -19: UTF-8 error: overlong 4-byte sequence + \xf8\x80\x80\x80\x8f +Failed: error -20: UTF-8 error: overlong 5-byte sequence + \xfc\x80\x80\x80\x80\x8f +Failed: error -21: UTF-8 error: overlong 6-byte sequence + \x80 +Failed: error -22: UTF-8 error: isolated 0x80 byte + \xfe +Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) + \xff +Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) + +/badutf/utf + \xfb\x80\x80\x80\x80 +Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) + \xfd\x80\x80\x80\x80\x80 +Failed: error -14: UTF-8 
error: 6-byte character is not allowed (RFC 3629) + \xf7\xbf\xbf\xbf +Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined + +/shortutf/utf + \xdf\=ph +Failed: error -3: UTF-8 error: 1 byte missing at end + \xef\=ph +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xef\x80\=ph +Failed: error -3: UTF-8 error: 1 byte missing at end + \xf7\=ph +Failed: error -5: UTF-8 error: 3 bytes missing at end + \xf7\x80\=ph +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xf7\x80\x80\=ph +Failed: error -3: UTF-8 error: 1 byte missing at end + \xfb\=ph +Failed: error -6: UTF-8 error: 4 bytes missing at end + \xfb\x80\=ph +Failed: error -5: UTF-8 error: 3 bytes missing at end + \xfb\x80\x80\=ph +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xfb\x80\x80\x80\=ph +Failed: error -3: UTF-8 error: 1 byte missing at end + \xfd\=ph +Failed: error -7: UTF-8 error: 5 bytes missing at end + \xfd\x80\=ph +Failed: error -6: UTF-8 error: 4 bytes missing at end + \xfd\x80\x80\=ph +Failed: error -5: UTF-8 error: 3 bytes missing at end + \xfd\x80\x80\x80\=ph +Failed: error -4: UTF-8 error: 2 bytes missing at end + \xfd\x80\x80\x80\x80\=ph +Failed: error -3: UTF-8 error: 1 byte missing at end + +/anything/utf + \xc0\x80 +Failed: error -17: UTF-8 error: overlong 2-byte sequence + \xc1\x8f +Failed: error -17: UTF-8 error: overlong 2-byte sequence + \xe0\x9f\x80 +Failed: error -18: UTF-8 error: overlong 3-byte sequence + \xf0\x8f\x80\x80 +Failed: error -19: UTF-8 error: overlong 4-byte sequence + \xf8\x87\x80\x80\x80 +Failed: error -20: UTF-8 error: overlong 5-byte sequence + \xfc\x83\x80\x80\x80\x80 +Failed: error -21: UTF-8 error: overlong 6-byte sequence + \xfe\x80\x80\x80\x80\x80 +Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) + \xff\x80\x80\x80\x80\x80 +Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) + \xc3\x8f +No match + \xe0\xaf\x80 +No match + \xe1\x80\x80 +No match + \xf0\x9f\x80\x80 +No match + \xf1\x8f\x80\x80 
+No match + \xf8\x88\x80\x80\x80 +Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) + \xf9\x87\x80\x80\x80 +Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) + \xfc\x84\x80\x80\x80\x80 +Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) + \xfd\x83\x80\x80\x80\x80 +Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) + \xf8\x88\x80\x80\x80\=no_utf_check +No match + \xf9\x87\x80\x80\x80\=no_utf_check +No match + \xfc\x84\x80\x80\x80\x80\=no_utf_check +No match + \xfd\x83\x80\x80\x80\x80\=no_utf_check +No match + +/\x{100}/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x80 +Subject length lower bound = 1 + +/\x{1000}/IB,utf +------------------------------------------------------------------ + Bra + \x{1000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xe1 +Last code unit = \x80 +Subject length lower bound = 1 + +/\x{10000}/IB,utf +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xf0 +Last code unit = \x80 +Subject length lower bound = 1 + +/\x{100000}/IB,utf +------------------------------------------------------------------ + Bra + \x{100000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xf4 +Last code unit = \x80 +Subject length lower bound = 1 + +/\x{10ffff}/IB,utf +------------------------------------------------------------------ + Bra + \x{10ffff} + Ket + End 
+------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xf4 +Last code unit = \xbf +Subject length lower bound = 1 + +/[\x{ff}]/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc3 +Last code unit = \xbf +Subject length lower bound = 1 + +/[\x{100}]/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x80 +Subject length lower bound = 1 + +/\x80/IB,utf +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc2 +Last code unit = \x80 +Subject length lower bound = 1 + +/\xff/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc3 +Last code unit = \xbf +Subject length lower bound = 1 + +/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf +------------------------------------------------------------------ + Bra + \x{d55c}\x{ad6d}\x{c5b4} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xed +Last code unit = \xb4 +Subject length lower bound = 3 + \x{D55c}\x{ad6d}\x{C5B4} + 0: \x{d55c}\x{ad6d}\x{c5b4} + +/\x{65e5}\x{672c}\x{8a9e}/IB,utf +------------------------------------------------------------------ + Bra + \x{65e5}\x{672c}\x{8a9e} + Ket + End 
+------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xe6 +Last code unit = \x9e +Subject length lower bound = 3 + \x{65e5}\x{672c}\x{8a9e} + 0: \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/IB,utf +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc2 +Last code unit = \x80 +Subject length lower bound = 1 + +/\x{084}/IB,utf +------------------------------------------------------------------ + Bra + \x{84} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc2 +Last code unit = \x84 +Subject length lower bound = 1 + +/\x{104}/IB,utf +------------------------------------------------------------------ + Bra + \x{104} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x84 +Subject length lower bound = 1 + +/\x{861}/IB,utf +------------------------------------------------------------------ + Bra + \x{861} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xe0 +Last code unit = \xa1 +Subject length lower bound = 1 + +/\x{212ab}/IB,utf +------------------------------------------------------------------ + Bra + \x{212ab} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xf0 +Last code unit = \xab +Subject length lower bound = 1 + +# This one is here not because it's different to Perl, but because the way +# the captured single-byte is displayed. (In Perl it becomes a character, and you +# can't tell the difference.) 
+ +/X(\C)(.*)/utf + X\x{1234} + 0: X\x{1234} + 1: \x{e1} + 2: \x{88}\x{b4} + X\nabc + 0: X\x{0a}abc + 1: \x{0a} + 2: abc + +# This one is here because Perl gives out a grumbly error message (quite +# correctly, but that messes up comparisons). + +/a\Cb/utf + *** Failers +No match + a\x{100}b +No match + +/[^ab\xC0-\xF0]/IB,utf +------------------------------------------------------------------ + Bra + [\x00-`c-\xbf\xf1-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 + \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf + \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee + \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd + \xfe \xff +No last code unit +Subject length lower bound = 1 + \x{f1} + 0: \x{f1} + \x{bf} + 0: \x{bf} + \x{100} + 0: \x{100} + \x{1000} + 0: \x{1000} + *** Failers + 0: * + \x{c0} +No match + \x{f0} +No match + +/Ā{3,4}/IB,utf +------------------------------------------------------------------ + Bra + \x{100}{3} + \x{100}?+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x80 +Subject length lower bound = 3 + \x{100}\x{100}\x{100}\x{100\x{100} + 0: \x{100}\x{100}\x{100} + +/(\x{100}+|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}++ + Alt + x + Ket + Ket + End 
+------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: x \xc4 +No last code unit +Subject length lower bound = 1 + +/(\x{100}*a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}*+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: a x \xc4 +No last code unit +Subject length lower bound = 1 + +/(\x{100}{0,2}a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}{0,2}+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: a x \xc4 +No last code unit +Subject length lower bound = 1 + +/(\x{100}{1,2}a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100} + \x{100}{0,1}+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: x \xc4 +No last code unit +Subject length lower bound = 1 + +/\x{100}/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x80 +Subject length lower bound = 1 + +/a\x{100}\x{101}*/IB,utf +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}*+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'a' +Last code unit = \x80 +Subject length lower bound = 2 + +/a\x{100}\x{101}+/IB,utf +------------------------------------------------------------------ + 
Bra + a\x{100} + \x{101}++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'a' +Last code unit = \x81 +Subject length lower bound = 3 + +/[^\x{c4}]/IB +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/[\x{100}]/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x80 +Subject length lower bound = 1 + \x{100} + 0: \x{100} + Z\x{100} + 0: \x{100} + \x{100}Z + 0: \x{100} + *** Failers +No match + +/[\xff]/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc3 +Last code unit = \xbf +Subject length lower bound = 1 + >\x{ff}< + 0: \x{ff} + +/[^\xff]/IB,utf +------------------------------------------------------------------ + Bra + [^\x{ff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first code unit +No last code unit +Subject length lower bound = 1 + +/\x{100}abc(xyz(?1))/IB,utf +------------------------------------------------------------------ + Bra + \x{100}abc + CBra 1 + xyz + Recurse + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +First code unit = \xc4 +Last code unit = 'z' +Subject length lower bound = 7 + +/a\x{1234}b/utf,posix + a\x{1234}b + 0: a\x{1234}b + +/\777/I,utf +Capturing 
subpattern count = 0 +Options: utf +First code unit = \xc7 +Last code unit = \xbf +Subject length lower bound = 1 + \x{1ff} + 0: \x{1ff} + \777 + 0: \x{1ff} + +/\x{100}+\x{200}/IB,utf +------------------------------------------------------------------ + Bra + \x{100}++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = \x80 +Subject length lower bound = 2 + +/\x{100}+X/IB,utf +------------------------------------------------------------------ + Bra + \x{100}++ + X + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc4 +Last code unit = 'X' +Subject length lower bound = 2 + +/^[\QĀ\E-\QŐ\E/B,utf +Failed: error 106 at offset 15: missing terminating ] for character class + +# This tests the stricter UTF-8 check according to RFC 3629. + +/X/utf + \x{d800} +Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined + \x{d800}\=no_utf_check +No match + \x{da00} +Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined + \x{da00}\=no_utf_check +No match + \x{dfff} +Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined + \x{dfff}\=no_utf_check +No match + \x{110000} +Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined + \x{110000}\=no_utf_check +No match + \x{2000000} +Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) + \x{2000000}\=no_utf_check +No match + \x{7fffffff} +Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) + \x{7fffffff}\=no_utf_check +No match + +/(*UTF8)\x{1234}/ + abcd\x{1234}pqr + 0: \x{1234} + +/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I +Capturing subpattern count = 0 +Compile options: +Overall options: utf +Forced newline is CRLF +First code unit = 'a' +Last code unit = 'b' +Subject length lower bound = 3 + 
+/\h/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x20 \xc2 \xe1 \xe2 \xe3 +No last code unit +Subject length lower bound = 1 + ABC\x{09} + 0: \x{09} + ABC\x{20} + 0: + ABC\x{a0} + 0: \x{a0} + ABC\x{1680} + 0: \x{1680} + ABC\x{180e} + 0: \x{180e} + ABC\x{2000} + 0: \x{2000} + ABC\x{202f} + 0: \x{202f} + ABC\x{205f} + 0: \x{205f} + ABC\x{3000} + 0: \x{3000} + +/\v/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2 +No last code unit +Subject length lower bound = 1 + ABC\x{0a} + 0: \x{0a} + ABC\x{0b} + 0: \x{0b} + ABC\x{0c} + 0: \x{0c} + ABC\x{0d} + 0: \x{0d} + ABC\x{85} + 0: \x{85} + ABC\x{2028} + 0: \x{2028} + +/\h*A/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 +Last code unit = 'A' +Subject length lower bound = 1 + CDBABC + 0: A + +/\v+A/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2 +Last code unit = 'A' +Subject length lower bound = 2 + +/\s?xxx\s/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x +Last code unit = 'x' +Subject length lower bound = 4 + +/\sxxx\s/I,utf,tables=2 +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xc2 +Last code unit = 'x' +Subject length lower bound = 5 + AB\x{85}xxx\x{a0}XYZ + 0: \x{85}xxx\x{a0} + AB\x{a0}xxx\x{85}XYZ + 0: \x{a0}xxx\x{85} + +/\S \S/I,utf,tables=2 +Capturing subpattern count = 0 +Options: utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f + \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e + \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
@ A B C + D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h + i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4 + \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 + \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 + \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 + \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff +Last code unit = ' ' +Subject length lower bound = 3 + \x{a2} \x{84} + 0: \x{a2} \x{84} + A Z + 0: A Z + +/a+/utf + a\x{123}aa\=offset=1 + 0: aa + a\x{123}aa\=offset=2 +Error -36 (bad UTF-8 offset) + a\x{123}aa\=offset=3 + 0: aa + a\x{123}aa\=offset=4 + 0: a + a\x{123}aa\=offset=5 +No match + a\x{123}aa\=offset=6 +Failed: error -34: bad offset value + +/\x{1234}+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: \xe1 +No last code unit +Subject length lower bound = 1 + +/\x{1234}+?/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: \xe1 +No last code unit +Subject length lower bound = 1 + +/\x{1234}++/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: \xe1 +No last code unit +Subject length lower bound = 1 + +/\x{1234}{2}/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: \xe1 +No last code unit +Subject length lower bound = 2 + +/[^\x{c4}]/IB,utf +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first code unit +No last code unit +Subject length lower bound = 1 + +/X+\x{200}/IB,utf +------------------------------------------------------------------ + Bra + X++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'X' 
+Last code unit = \x80 +Subject length lower bound = 2 + +/\R/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2 +No last code unit +Subject length lower bound = 1 + +/\777/IB,utf +------------------------------------------------------------------ + Bra + \x{1ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xc7 +Last code unit = \xbf +Subject length lower bound = 1 + +/\w+\x{C4}/B,utf +------------------------------------------------------------------ + Bra + \w++ + \x{c4} + Ket + End +------------------------------------------------------------------ + a\x{C4}\x{C4} + 0: a\x{c4} + +/\w+\x{C4}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \w+ + \x{c4} + Ket + End +------------------------------------------------------------------ + a\x{C4}\x{C4} + 0: a\x{c4}\x{c4} + +/\W+\x{C4}/B,utf +------------------------------------------------------------------ + Bra + \W+ + \x{c4} + Ket + End +------------------------------------------------------------------ + !\x{C4} + 0: !\x{c4} + +/\W+\x{C4}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \W++ + \x{c4} + Ket + End +------------------------------------------------------------------ + !\x{C4} + 0: !\x{c4} + +/\W+\x{A1}/B,utf +------------------------------------------------------------------ + Bra + \W+ + \x{a1} + Ket + End +------------------------------------------------------------------ + !\x{A1} + 0: !\x{a1} + +/\W+\x{A1}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \W+ + \x{a1} + Ket + End +------------------------------------------------------------------ + !\x{A1} + 0: !\x{a1} + +/X\s+\x{A0}/B,utf +------------------------------------------------------------------ + Bra + X + \s++ + \x{a0} + Ket + End 
+------------------------------------------------------------------ + X\x20\x{A0}\x{A0} + 0: X \x{a0} + +/X\s+\x{A0}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + X + \s+ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x20\x{A0}\x{A0} + 0: X \x{a0}\x{a0} + +/\S+\x{A0}/B,utf +------------------------------------------------------------------ + Bra + \S+ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x{A0}\x{A0} + 0: X\x{a0}\x{a0} + +/\S+\x{A0}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \S++ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x{A0}\x{A0} + 0: X\x{a0} + +/\x{a0}+\s!/B,utf +------------------------------------------------------------------ + Bra + \x{a0}++ + \s + ! + Ket + End +------------------------------------------------------------------ + \x{a0}\x20! + 0: \x{a0} ! + +/\x{a0}+\s!/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \x{a0}+ + \s + ! + Ket + End +------------------------------------------------------------------ + \x{a0}\x20! + 0: \x{a0} ! 
+ +/A/utf + \x{ff000041} +** Character \x{ff000041} is greater than 0x7fffffff and so cannot be converted to UTF-8 + \x{7f000041} +Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) + +/(*UTF8)abc/never_utf +Failed: error 174 at offset 7: using UTF is disabled by the application + +/abc/utf,never_utf +Failed: error 174 at offset 0: using UTF is disabled by the application + +/\w/posix + +++\x{c2} +No match: POSIX code 17: match failed + +/\w/ucp,posix + +++\x{c2} + 0: \xc2 + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf +------------------------------------------------------------------ + Bra + /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless utf +First code unit = 'A' (caseless) +No last code unit +Subject length lower bound = 5 + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf +------------------------------------------------------------------ + Bra + A\x{391}\x{10427}\x{ff3a}\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = \xb0 +Subject length lower bound = 5 + +/AB\x{1fb0}/IB,utf +------------------------------------------------------------------ + Bra + AB\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = \xb0 +Subject length lower bound = 3 + +/AB\x{1fb0}/IBi,utf +------------------------------------------------------------------ + Bra + /i AB\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless utf +First code unit = 'A' (caseless) +Last code unit = 'B' (caseless) +Subject length lower bound = 3 + 
+/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: \xd0 \xd1 +No last code unit +Subject length lower bound = 17 + \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + +/[ⱥ]/Bi,utf +------------------------------------------------------------------ + Bra + /i \x{2c65} + Ket + End +------------------------------------------------------------------ + +/[^ⱥ]/Bi,utf +------------------------------------------------------------------ + Bra + /i [^\x{2c65}] + Ket + End +------------------------------------------------------------------ + +/\h/I +Capturing subpattern count = 0 +No options +Starting code units: \x09 \x20 \xa0 +No last code unit +Subject length lower bound = 1 + +/\v/I +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 +No last code unit +Subject length lower bound = 1 + +/\R/I +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 +No last code unit +Subject length lower bound = 1 + +/[[:blank:]]/B,ucp +------------------------------------------------------------------ + Bra + [\x09 \xa0] + Ket + End +------------------------------------------------------------------ + +/\x{212a}+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: K k \xe2 +No last code unit +Subject length lower bound = 1 + KKkk\x{212a} + 0: KKkk\x{212a} + +/s+/Ii,utf +Capturing subpattern count = 
0 +Options: caseless utf +Starting code units: S s \xc5 +No last code unit +Subject length lower bound = 1 + SSss\x{17f} + 0: SSss\x{17f} + +# End of testinput10 diff --git a/testdata/testoutput11-16 b/testdata/testoutput11-16 new file mode 100644 index 0000000..1f6387b --- /dev/null +++ b/testdata/testoutput11-16 @@ -0,0 +1,675 @@ +# This set of tests is for the 16-bit and 32-bit libraries' basic (non-UTF) +# features that are not compatible with the 8-bit library, or which give +# different output in 16-bit or 32-bit mode. The output for the two widths is +# different, so they have separate output files. + +#forbid_utf + +/a\Cb/ + aXb + 0: aXb + a\nb + 0: a\x0ab + +/[^\x{c4}]/IB +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/\x{100}/I +Capturing subpattern count = 0 +No options +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional leading comment +(?: (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. 
(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address +| # or +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # one word, optionally followed by.... +(?: +[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or... +\( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) | # comments, or... + +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +# quoted strings +)* +< (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # leading < +(?: @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* + +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* , (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +)* # further okay, if led by comma +: # closing colon +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* )? # optional route +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address spec +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* > # trailing > +# name and address +) (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional trailing comment +/Ix +Capturing subpattern count = 0 +Contains explicit CR or LF match +Options: extended +Starting code units: \x09 \x20 ! " # $ % & ' ( * + - / 0 1 2 3 4 5 6 7 8 + 9 = ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ^ _ ` a b c d e + f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xff +No last code unit +Subject length lower bound = 3 + +/[\h]/B +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] + Ket + End +------------------------------------------------------------------ + >\x09< + 0: \x09 + +/[\h]+/B +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}]++ + Ket + End +------------------------------------------------------------------ + >\x09\x20\xa0< + 0: \x09 \xa0 + +/[\v]/B +------------------------------------------------------------------ + Bra + [\x0a-\x0d\x85\x{2028}-\x{2029}] + Ket + End +------------------------------------------------------------------ + +/[^\h]/B +------------------------------------------------------------------ + Bra + [^\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] + Ket + End +------------------------------------------------------------------ + +/\h+/I +Capturing subpattern count = 0 +No options +Starting code units: \x09 \x20 \xa0 \xff +No last code unit +Subject length 
lower bound = 1 + \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000} + 0: \x{1680}\x{2000}\x{202f}\x{3000} + \x{3001}\x{2fff}\x{200a}\xa0\x{2000} + 0: \x{200a}\xa0\x{2000} + +/[\h\x{dc00}]+/IB +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}\x{dc00}]++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x09 \x20 \xa0 \xff +No last code unit +Subject length lower bound = 1 + \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000} + 0: \x{1680}\x{2000}\x{202f}\x{3000} + \x{3001}\x{2fff}\x{200a}\xa0\x{2000} + 0: \x{200a}\xa0\x{2000} + +/\H+/I +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f} + 0: \x{167f}\x{1681}\x{180d}\x{180f} + \x{2000}\x{200a}\x{1fff}\x{200b} + 0: \x{1fff}\x{200b} + \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060} + 0: \x{202e}\x{2030}\x{205e}\x{2060} + \xa0\x{3000}\x9f\xa1\x{2fff}\x{3001} + 0: \x9f\xa1\x{2fff}\x{3001} + +/[\H\x{d800}]+/ + \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f} + 0: \x{167f}\x{1681}\x{180d}\x{180f} + \x{2000}\x{200a}\x{1fff}\x{200b} + 0: \x{1fff}\x{200b} + \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060} + 0: \x{202e}\x{2030}\x{205e}\x{2060} + \xa0\x{3000}\x9f\xa1\x{2fff}\x{3001} + 0: \x9f\xa1\x{2fff}\x{3001} + +/\v+/I +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + \x{2027}\x{2030}\x{2028}\x{2029} + 0: \x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + 0: \x85\x0a\x0b\x0c\x0d + +/[\v\x{dc00}]+/IB +------------------------------------------------------------------ + Bra + [\x0a-\x0d\x85\x{2028}-\x{2029}\x{dc00}]++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 
+No options +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + \x{2027}\x{2030}\x{2028}\x{2029} + 0: \x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + 0: \x85\x0a\x0b\x0c\x0d + +/\V+/I +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + \x{2028}\x{2029}\x{2027}\x{2030} + 0: \x{2027}\x{2030} + \x85\x0a\x0b\x0c\x0d\x09\x0e\x84\x86 + 0: \x09\x0e\x84\x86 + +/[\V\x{d800}]+/ + \x{2028}\x{2029}\x{2027}\x{2030} + 0: \x{2027}\x{2030} + \x85\x0a\x0b\x0c\x0d\x09\x0e\x84\x86 + 0: \x09\x0e\x84\x86 + +/\R+/I,bsr=unicode +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + \x{2027}\x{2030}\x{2028}\x{2029} + 0: \x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + 0: \x85\x0a\x0b\x0c\x0d + +/\x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00}/I +Capturing subpattern count = 0 +No options +First code unit = \x{d800} +Last code unit = \x{dd00} +Subject length lower bound = 6 + \x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00} + 0: \x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00} + +/[^\x{80}][^\x{ff}][^\x{100}][^\x{1000}][^\x{ffff}]/B +------------------------------------------------------------------ + Bra + [^\x{80}] + [^\x{ff}] + [^\x{100}] + [^\x{1000}] + [^\x{ffff}] + Ket + End +------------------------------------------------------------------ + +/[^\x{80}][^\x{ff}][^\x{100}][^\x{1000}][^\x{ffff}]/Bi +------------------------------------------------------------------ + Bra + /i [^\x{80}] + /i [^\x{ff}] + /i [^\x{100}] + /i [^\x{1000}] + /i [^\x{ffff}] + Ket + End +------------------------------------------------------------------ + +/[^\x{100}]*[^\x{1000}]+[^\x{ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{100}]{5,6}+/B +------------------------------------------------------------------ + Bra + [^\x{100}]* + [^\x{1000}]+ + [^\x{ffff}]?? 
+ [^\x{8000}]{4} + [^\x{8000}]* + [^\x{7fff}]{2} + [^\x{7fff}]{0,7}? + [^\x{100}]{5} + [^\x{100}]?+ + Ket + End +------------------------------------------------------------------ + +/[^\x{100}]*[^\x{1000}]+[^\x{ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{100}]{5,6}+/Bi +------------------------------------------------------------------ + Bra + /i [^\x{100}]* + /i [^\x{1000}]+ + /i [^\x{ffff}]?? + /i [^\x{8000}]{4} + /i [^\x{8000}]* + /i [^\x{7fff}]{2} + /i [^\x{7fff}]{0,7}? + /i [^\x{100}]{5} + /i [^\x{100}]?+ + Ket + End +------------------------------------------------------------------ + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF)XX/mark + XX + 0: XX +MK: 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE)XX/mark + XX + 0: XX +MK: 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE + +/\u0100/B,alt_bsux,allow_empty_class,match_unset_backref +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ + +/[\u0100-\u0200]/B,alt_bsux,allow_empty_class,match_unset_backref 
+------------------------------------------------------------------ + Bra + [\x{100}-\x{200}] + Ket + End +------------------------------------------------------------------ + +/\ud800/B,alt_bsux,allow_empty_class,match_unset_backref +------------------------------------------------------------------ + Bra + \x{d800} + Ket + End +------------------------------------------------------------------ + +/^\x{ffff}+/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}?/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}*/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}{3}/i + \x{ffff}\x{ffff}\x{ffff} + 0: \x{ffff}\x{ffff}\x{ffff} + +/^\x{ffff}{0,3}/i + \x{ffff} + 0: \x{ffff} + +/[^\x00-a]{12,}[^b-\xff]*/B +------------------------------------------------------------------ + Bra + [b-\xff] (neg){12,} + [\x00-a] (neg)*+ + Ket + End +------------------------------------------------------------------ + +/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B +------------------------------------------------------------------ + Bra + [\x00-\x08\x0e-\x1f!-\xff] (neg)* + \s* + + [0-9A-Z_a-z]++ + \W+ + + [\x00-/:-\xff] (neg)*? + \d + 0 + [\x00-/:-@[-^`{-\xff] (neg){4,6}? + \w* + A + Ket + End +------------------------------------------------------------------ + +/a*[b-\x{200}]?a#a*[b-\x{200}]?b#[a-f]*[g-\x{200}]*#[g-\x{200}]*[a-c]*#[g-\x{200}]*[a-h]*/B +------------------------------------------------------------------ + Bra + a* + [b-\xff\x{100}-\x{200}]?+ + a# + a*+ + [b-\xff\x{100}-\x{200}]? + b# + [a-f]*+ + [g-\xff\x{100}-\x{200}]*+ + # + [g-\xff\x{100}-\x{200}]*+ + [a-c]*+ + # + [g-\xff\x{100}-\x{200}]* + [a-h]*+ + Ket + End +------------------------------------------------------------------ + +/^[\x{1234}\x{4321}]{2,4}?/ + \x{1234}\x{1234}\x{1234} + 0: \x{1234}\x{1234} + +# Check maximum non-UTF character size for the 16-bit library. 
+ +/\x{ffff}/ + A\x{ffff}B + 0: \x{ffff} + +/\x{10000}/ +Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large + +/\o{20000}/ + +# Check maximum character size for the 32-bit library. These will all give +# errors in the 16-bit library. + +/\x{110000}/ +Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large + +/\x{7fffffff}/ +Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large + +/\x{80000000}/ +Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large + +/\x{ffffffff}/ +Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large + +/\x{100000000}/ +Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large + +/\o{17777777777}/ +Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large + +/\o{20000000000}/ +Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large + +/\o{37777777777}/ +Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large + +/\o{40000000000}/ +Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large + +/\x{7fffffff}\x{7fffffff}/I +Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large + +/\x{80000000}\x{80000000}/I +Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large + +/\x{ffffffff}\x{ffffffff}/I +Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large + +# Non-UTF characters + +/\C{2,3}/ + \x{400000}\x{400001}\x{400002}\x{400003} +** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. +** Character \x{400001} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. 
+** Character \x{400002} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. +** Character \x{400003} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. + 0: \x00\x01\x02 + +/\x{400000}\x{800000}/IBi +Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large + +# Check character ranges + +/[\H]/IB +------------------------------------------------------------------ + Bra + [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b + \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a + \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 + : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ + _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 + \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f + \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e + \x9f \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae + \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd + \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc + \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb + \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea + \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 + \xfa \xfb \xfc \xfd \xfe \xff +No last code unit +Subject length lower bound = 1 + +/[\V]/IB +------------------------------------------------------------------ + Bra + [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0e + \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d + \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > + ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c + d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 + \x83 \x84 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 + \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 + \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 + \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf + \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce + \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd + \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec + \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb + \xfc \xfd \xfe \xff +No last code unit +Subject length lower bound = 1 + +# End of testinput11 diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32 new file mode 100644 index 0000000..255ab4d --- /dev/null +++ b/testdata/testoutput11-32 @@ -0,0 +1,681 @@ +# This set of tests is for the 16-bit and 32-bit libraries' basic (non-UTF) +# features that are not compatible with the 8-bit library, or which give +# different output in 16-bit or 32-bit mode. The output for the two widths is +# different, so they have separate output files. 
+ +#forbid_utf + +/a\Cb/ + aXb + 0: aXb + a\nb + 0: a\x0ab + +/[^\x{c4}]/IB +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/\x{100}/I +Capturing subpattern count = 0 +No options +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional leading comment +(?: (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address +| # or +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # one word, optionally followed by.... +(?: +[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or... +\( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) | # comments, or... 
+ +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +# quoted strings +)* +< (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # leading < +(?: @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* + +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* , (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +)* # further okay, if led by comma +: # closing colon +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* )? # optional route +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address spec +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* > # trailing > +# name and address +) (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional trailing comment +/Ix +Capturing subpattern count = 0 +Contains explicit CR or LF match +Options: extended +Starting code units: \x09 \x20 ! " # $ % & ' ( * + - / 0 1 2 3 4 5 6 7 8 + 9 = ? 
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ^ _ ` a b c d e + f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xff +No last code unit +Subject length lower bound = 3 + +/[\h]/B +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] + Ket + End +------------------------------------------------------------------ + >\x09< + 0: \x09 + +/[\h]+/B +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}]++ + Ket + End +------------------------------------------------------------------ + >\x09\x20\xa0< + 0: \x09 \xa0 + +/[\v]/B +------------------------------------------------------------------ + Bra + [\x0a-\x0d\x85\x{2028}-\x{2029}] + Ket + End +------------------------------------------------------------------ + +/[^\h]/B +------------------------------------------------------------------ + Bra + [^\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] + Ket + End +------------------------------------------------------------------ + +/\h+/I +Capturing subpattern count = 0 +No options +Starting code units: \x09 \x20 \xa0 \xff +No last code unit +Subject length lower bound = 1 + \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000} + 0: \x{1680}\x{2000}\x{202f}\x{3000} + \x{3001}\x{2fff}\x{200a}\xa0\x{2000} + 0: \x{200a}\xa0\x{2000} + +/[\h\x{dc00}]+/IB +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}\x{dc00}]++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x09 \x20 \xa0 \xff +No last code unit +Subject length lower bound = 1 + \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000} + 0: \x{1680}\x{2000}\x{202f}\x{3000} + \x{3001}\x{2fff}\x{200a}\xa0\x{2000} + 0: \x{200a}\xa0\x{2000} + 
+/\H+/I +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f} + 0: \x{167f}\x{1681}\x{180d}\x{180f} + \x{2000}\x{200a}\x{1fff}\x{200b} + 0: \x{1fff}\x{200b} + \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060} + 0: \x{202e}\x{2030}\x{205e}\x{2060} + \xa0\x{3000}\x9f\xa1\x{2fff}\x{3001} + 0: \x9f\xa1\x{2fff}\x{3001} + +/[\H\x{d800}]+/ + \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f} + 0: \x{167f}\x{1681}\x{180d}\x{180f} + \x{2000}\x{200a}\x{1fff}\x{200b} + 0: \x{1fff}\x{200b} + \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060} + 0: \x{202e}\x{2030}\x{205e}\x{2060} + \xa0\x{3000}\x9f\xa1\x{2fff}\x{3001} + 0: \x9f\xa1\x{2fff}\x{3001} + +/\v+/I +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + \x{2027}\x{2030}\x{2028}\x{2029} + 0: \x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + 0: \x85\x0a\x0b\x0c\x0d + +/[\v\x{dc00}]+/IB +------------------------------------------------------------------ + Bra + [\x0a-\x0d\x85\x{2028}-\x{2029}\x{dc00}]++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + \x{2027}\x{2030}\x{2028}\x{2029} + 0: \x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + 0: \x85\x0a\x0b\x0c\x0d + +/\V+/I +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + \x{2028}\x{2029}\x{2027}\x{2030} + 0: \x{2027}\x{2030} + \x85\x0a\x0b\x0c\x0d\x09\x0e\x84\x86 + 0: \x09\x0e\x84\x86 + +/[\V\x{d800}]+/ + \x{2028}\x{2029}\x{2027}\x{2030} + 0: \x{2027}\x{2030} + \x85\x0a\x0b\x0c\x0d\x09\x0e\x84\x86 + 0: \x09\x0e\x84\x86 + +/\R+/I,bsr=unicode +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b 
\x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + \x{2027}\x{2030}\x{2028}\x{2029} + 0: \x{2028}\x{2029} + \x09\x0e\x84\x86\x85\x0a\x0b\x0c\x0d + 0: \x85\x0a\x0b\x0c\x0d + +/\x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00}/I +Capturing subpattern count = 0 +No options +First code unit = \x{d800} +Last code unit = \x{dd00} +Subject length lower bound = 6 + \x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00} + 0: \x{d800}\x{d7ff}\x{dc00}\x{dc00}\x{dcff}\x{dd00} + +/[^\x{80}][^\x{ff}][^\x{100}][^\x{1000}][^\x{ffff}]/B +------------------------------------------------------------------ + Bra + [^\x{80}] + [^\x{ff}] + [^\x{100}] + [^\x{1000}] + [^\x{ffff}] + Ket + End +------------------------------------------------------------------ + +/[^\x{80}][^\x{ff}][^\x{100}][^\x{1000}][^\x{ffff}]/Bi +------------------------------------------------------------------ + Bra + /i [^\x{80}] + /i [^\x{ff}] + /i [^\x{100}] + /i [^\x{1000}] + /i [^\x{ffff}] + Ket + End +------------------------------------------------------------------ + +/[^\x{100}]*[^\x{1000}]+[^\x{ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{100}]{5,6}+/B +------------------------------------------------------------------ + Bra + [^\x{100}]* + [^\x{1000}]+ + [^\x{ffff}]?? + [^\x{8000}]{4} + [^\x{8000}]* + [^\x{7fff}]{2} + [^\x{7fff}]{0,7}? + [^\x{100}]{5} + [^\x{100}]?+ + Ket + End +------------------------------------------------------------------ + +/[^\x{100}]*[^\x{1000}]+[^\x{ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{100}]{5,6}+/Bi +------------------------------------------------------------------ + Bra + /i [^\x{100}]* + /i [^\x{1000}]+ + /i [^\x{ffff}]?? + /i [^\x{8000}]{4} + /i [^\x{8000}]* + /i [^\x{7fff}]{2} + /i [^\x{7fff}]{0,7}? 
+ /i [^\x{100}]{5} + /i [^\x{100}]?+ + Ket + End +------------------------------------------------------------------ + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF)XX/mark + XX + 0: XX +MK: 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE)XX/mark + XX + 0: XX +MK: 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE + +/\u0100/B,alt_bsux,allow_empty_class,match_unset_backref +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ + +/[\u0100-\u0200]/B,alt_bsux,allow_empty_class,match_unset_backref +------------------------------------------------------------------ + Bra + [\x{100}-\x{200}] + Ket + End +------------------------------------------------------------------ + +/\ud800/B,alt_bsux,allow_empty_class,match_unset_backref +------------------------------------------------------------------ + Bra + \x{d800} + Ket + End +------------------------------------------------------------------ + +/^\x{ffff}+/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}?/i + \x{ffff} + 0: \x{ffff} + 
+/^\x{ffff}*/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}{3}/i + \x{ffff}\x{ffff}\x{ffff} + 0: \x{ffff}\x{ffff}\x{ffff} + +/^\x{ffff}{0,3}/i + \x{ffff} + 0: \x{ffff} + +/[^\x00-a]{12,}[^b-\xff]*/B +------------------------------------------------------------------ + Bra + [b-\xff] (neg){12,} + [\x00-a] (neg)*+ + Ket + End +------------------------------------------------------------------ + +/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B +------------------------------------------------------------------ + Bra + [\x00-\x08\x0e-\x1f!-\xff] (neg)* + \s* + + [0-9A-Z_a-z]++ + \W+ + + [\x00-/:-\xff] (neg)*? + \d + 0 + [\x00-/:-@[-^`{-\xff] (neg){4,6}? + \w* + A + Ket + End +------------------------------------------------------------------ + +/a*[b-\x{200}]?a#a*[b-\x{200}]?b#[a-f]*[g-\x{200}]*#[g-\x{200}]*[a-c]*#[g-\x{200}]*[a-h]*/B +------------------------------------------------------------------ + Bra + a* + [b-\xff\x{100}-\x{200}]?+ + a# + a*+ + [b-\xff\x{100}-\x{200}]? + b# + [a-f]*+ + [g-\xff\x{100}-\x{200}]*+ + # + [g-\xff\x{100}-\x{200}]*+ + [a-c]*+ + # + [g-\xff\x{100}-\x{200}]* + [a-h]*+ + Ket + End +------------------------------------------------------------------ + +/^[\x{1234}\x{4321}]{2,4}?/ + \x{1234}\x{1234}\x{1234} + 0: \x{1234}\x{1234} + +# Check maximum non-UTF character size for the 16-bit library. + +/\x{ffff}/ + A\x{ffff}B + 0: \x{ffff} + +/\x{10000}/ + +/\o{20000}/ + +# Check maximum character size for the 32-bit library. These will all give +# errors in the 16-bit library. 
+ +/\x{110000}/ + +/\x{7fffffff}/ + +/\x{80000000}/ + +/\x{ffffffff}/ + +/\x{100000000}/ +Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large + +/\o{17777777777}/ + +/\o{20000000000}/ + +/\o{37777777777}/ + +/\o{40000000000}/ +Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large + +/\x{7fffffff}\x{7fffffff}/I +Capturing subpattern count = 0 +No options +First code unit = \x{7fffffff} +Last code unit = \x{7fffffff} +Subject length lower bound = 2 + +/\x{80000000}\x{80000000}/I +Capturing subpattern count = 0 +No options +First code unit = \x{80000000} +Last code unit = \x{80000000} +Subject length lower bound = 2 + +/\x{ffffffff}\x{ffffffff}/I +Capturing subpattern count = 0 +No options +First code unit = \x{ffffffff} +Last code unit = \x{ffffffff} +Subject length lower bound = 2 + +# Non-UTF characters + +/\C{2,3}/ + \x{400000}\x{400001}\x{400002}\x{400003} + 0: \x{400000}\x{400001}\x{400002} + +/\x{400000}\x{800000}/IBi +------------------------------------------------------------------ + Bra + /i \x{400000}\x{800000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless +First code unit = \x{400000} +Last code unit = \x{800000} +Subject length lower bound = 2 + +# Check character ranges + +/[\H]/IB +------------------------------------------------------------------ + Bra + [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffffffff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b + \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a + \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 + : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ + _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 + \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f + \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e + \x9f \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae + \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd + \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc + \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb + \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea + \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 + \xfa \xfb \xfc \xfd \xfe \xff +No last code unit +Subject length lower bound = 1 + +/[\V]/IB +------------------------------------------------------------------ + Bra + [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffffffff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0e + \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d + \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > + ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c + d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 + \x83 \x84 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 + \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 + \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 + \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf + \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce + \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd + \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec + \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb + \xfc \xfd \xfe \xff +No last code unit +Subject length lower bound = 1 + +# End of testinput11 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 new file mode 100644 index 0000000..78dfd62 --- /dev/null +++ b/testdata/testoutput12-16 @@ -0,0 +1,1159 @@ +# This set of tests is for UTF-16 and UTF-32 support, and is relevant only to +# the 16-bit and 32-bit libraries. The output is different for each library, +# so there are separate output files. 
+ +/xxx/IB,utf,no_utf_check +** Failed: invalid UTF-8 string cannot be converted to 16-bit string + +/abc/utf + ] +** Failed: invalid UTF-8 string cannot be used as input in UTF mode + +/X(\C{3})/utf + X\x{11234}Y + 0: X\x{11234}Y + 1: \x{11234}Y + X\x{11234}YZ + 0: X\x{11234}Y + 1: \x{11234}Y + +/X(\C{4})/utf + X\x{11234}YZ + 0: X\x{11234}YZ + 1: \x{11234}YZ + X\x{11234}YZW + 0: X\x{11234}YZ + 1: \x{11234}YZ + +/X\C*/utf + XYZabcdce + 0: XYZabcdce + +/X\C*?/utf + XYZabcde + 0: X + +/X\C{3,5}/utf + Xabcdefg + 0: Xabcde + X\x{11234}Y + 0: X\x{11234}Y + X\x{11234}YZ + 0: X\x{11234}YZ + X\x{11234}\x{512} + 0: X\x{11234}\x{512} + X\x{11234}\x{512}YZ + 0: X\x{11234}\x{512}YZ + X\x{11234}\x{512}\x{11234}Z + 0: X\x{11234}\x{512}\x{11234} + +/X\C{3,5}?/utf + Xabcdefg + 0: Xabc + X\x{11234}Y + 0: X\x{11234}Y + X\x{11234}YZ + 0: X\x{11234}Y + X\x{11234}\x{512}YZ + 0: X\x{11234}\x{512} + *** Failers +No match + X\x{11234} +No match + +/a\Cb/utf + aXb + 0: aXb + a\nb + 0: a\x{0a}b + +/a\C\Cb/utf + a\x{12257}b + 0: a\x{12257}b + a\x{12257}\x{11234}b +No match + ** Failers +No match + a\x{100}b +No match + +/ab\Cde/utf + abXde + 0: abXde + +# Check maximum character size + +/\x{ffff}/IB,utf +------------------------------------------------------------------ + Bra + \x{ffff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{ffff} +No last code unit +Subject length lower bound = 1 + +/\x{10000}/IB,utf +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d800} +Last code unit = \x{dc00} +Subject length lower bound = 1 + +/\x{100}/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing 
subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/\x{1000}/IB,utf +------------------------------------------------------------------ + Bra + \x{1000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{1000} +No last code unit +Subject length lower bound = 1 + +/\x{10000}/IB,utf +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d800} +Last code unit = \x{dc00} +Subject length lower bound = 1 + +/\x{100000}/IB,utf +------------------------------------------------------------------ + Bra + \x{100000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{dbc0} +Last code unit = \x{dc00} +Subject length lower bound = 1 + +/\x{10ffff}/IB,utf +------------------------------------------------------------------ + Bra + \x{10ffff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{dbff} +Last code unit = \x{dfff} +Subject length lower bound = 1 + +/[\x{ff}]/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xff +No last code unit +Subject length lower bound = 1 + +/[\x{100}]/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 
+ +/\x80/IB,utf +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x80 +No last code unit +Subject length lower bound = 1 + +/\xff/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xff +No last code unit +Subject length lower bound = 1 + +/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf +------------------------------------------------------------------ + Bra + \x{d55c}\x{ad6d}\x{c5b4} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d55c} +Last code unit = \x{c5b4} +Subject length lower bound = 3 + \x{D55c}\x{ad6d}\x{C5B4} + 0: \x{d55c}\x{ad6d}\x{c5b4} + +/\x{65e5}\x{672c}\x{8a9e}/IB,utf +------------------------------------------------------------------ + Bra + \x{65e5}\x{672c}\x{8a9e} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{65e5} +Last code unit = \x{8a9e} +Subject length lower bound = 3 + \x{65e5}\x{672c}\x{8a9e} + 0: \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/IB,utf +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x80 +No last code unit +Subject length lower bound = 1 + +/\x{084}/IB,utf +------------------------------------------------------------------ + Bra + \x{84} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x84 +No last code unit +Subject 
length lower bound = 1 + +/\x{104}/IB,utf +------------------------------------------------------------------ + Bra + \x{104} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{104} +No last code unit +Subject length lower bound = 1 + +/\x{861}/IB,utf +------------------------------------------------------------------ + Bra + \x{861} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{861} +No last code unit +Subject length lower bound = 1 + +/\x{212ab}/IB,utf +------------------------------------------------------------------ + Bra + \x{212ab} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d844} +Last code unit = \x{deab} +Subject length lower bound = 1 + +# This one is here not because it's different to Perl, but because the way +# the captured single-byte is displayed. (In Perl it becomes a character, and you +# can't tell the difference.) + +/X(\C)(.*)/utf + X\x{1234} + 0: X\x{1234} + 1: \x{1234} + 2: + X\nabc + 0: X\x{0a}abc + 1: \x{0a} + 2: abc + +# This one is here because Perl gives out a grumbly error message (quite +# correctly, but that messes up comparisons). + +/a\Cb/utf + *** Failers +No match + a\x{100}b + 0: a\x{100}b + +/[^ab\xC0-\xF0]/IB,utf +------------------------------------------------------------------ + Bra + [\x00-`c-\xbf\xf1-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e + \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d + \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac + \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb + \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb + \xfc \xfd \xfe \xff +No last code unit +Subject length lower bound = 1 + \x{f1} + 0: \x{f1} + \x{bf} + 0: \x{bf} + \x{100} + 0: \x{100} + \x{1000} + 0: \x{1000} + *** Failers + 0: * + \x{c0} +No match + \x{f0} +No match + +/Ā{3,4}/IB,utf +------------------------------------------------------------------ + Bra + \x{100}{3} + \x{100}?+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +Last code unit = \x{100} +Subject length lower bound = 3 + \x{100}\x{100}\x{100}\x{100\x{100} + 0: \x{100}\x{100}\x{100} + +/(\x{100}+|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}++ + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: x \xff +No last code unit +Subject length lower bound = 1 + +/(\x{100}*a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}*+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: a x \xff +No last code unit +Subject length lower bound = 1 + +/(\x{100}{0,2}a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}{0,2}+ + a + Alt + x + Ket + Ket + End 
+------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: a x \xff +No last code unit +Subject length lower bound = 1 + +/(\x{100}{1,2}a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100} + \x{100}{0,1}+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: x \xff +No last code unit +Subject length lower bound = 1 + +/\x{100}/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/a\x{100}\x{101}*/IB,utf +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}*+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'a' +Last code unit = \x{100} +Subject length lower bound = 2 + +/a\x{100}\x{101}+/IB,utf +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'a' +Last code unit = \x{101} +Subject length lower bound = 3 + +/[^\x{c4}]/IB +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/[\x{100}]/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End 
+------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + \x{100} + 0: \x{100} + Z\x{100} + 0: \x{100} + \x{100}Z + 0: \x{100} + *** Failers +No match + +/[\xff]/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xff +No last code unit +Subject length lower bound = 1 + >\x{ff}< + 0: \x{ff} + +/[^\xff]/IB,utf +------------------------------------------------------------------ + Bra + [^\x{ff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first code unit +No last code unit +Subject length lower bound = 1 + +/\x{100}abc(xyz(?1))/IB,utf +------------------------------------------------------------------ + Bra + \x{100}abc + CBra 1 + xyz + Recurse + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +First code unit = \x{100} +Last code unit = 'z' +Subject length lower bound = 7 + +/\777/I,utf +Capturing subpattern count = 0 +Options: utf +First code unit = \x{1ff} +No last code unit +Subject length lower bound = 1 + \x{1ff} + 0: \x{1ff} + \777 + 0: \x{1ff} + +/\x{100}+\x{200}/IB,utf +------------------------------------------------------------------ + Bra + \x{100}++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +Last code unit = \x{200} +Subject length lower bound = 2 + +/\x{100}+X/IB,utf +------------------------------------------------------------------ + Bra + \x{100}++ + X + Ket + End +------------------------------------------------------------------ +Capturing subpattern 
count = 0 +Options: utf +First code unit = \x{100} +Last code unit = 'X' +Subject length lower bound = 2 + +/^[\QĀ\E-\QŐ\E/B,utf +Failed: error 106 at offset 13: missing terminating ] for character class + +/X/utf + \x{d800} +Failed: error -24: UTF-16 error: missing low surrogate at end + \x{d800}\=no_utf_check +No match + \x{da00} +Failed: error -24: UTF-16 error: missing low surrogate at end + \x{da00}\=no_utf_check +No match + \x{dc00} +Failed: error -26: UTF-16 error: isolated low surrogate + \x{dc00}\=no_utf_check +No match + \x{de00} +Failed: error -26: UTF-16 error: isolated low surrogate + \x{de00}\=no_utf_check +No match + \x{dfff} +Failed: error -26: UTF-16 error: isolated low surrogate + \x{dfff}\=no_utf_check +No match + \x{110000} +** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16 + \x{d800}\x{1234} +Failed: error -25: UTF-16 error: invalid low surrogate + +/(*UTF16)\x{11234}/ + abcd\x{11234}pqr + 0: \x{11234} + +/(*UTF)\x{11234}/I +Capturing subpattern count = 0 +Compile options: +Overall options: utf +First code unit = \x{d804} +Last code unit = \x{de34} +Subject length lower bound = 1 + abcd\x{11234}pqr + 0: \x{11234} + +/(*UTF-32)\x{11234}/ +Failed: error 160 at offset 5: (*VERB) not recognized or malformed + abcd\x{11234}pqr + +/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I +Capturing subpattern count = 0 +Compile options: +Overall options: utf +Forced newline is CRLF +First code unit = 'a' +Last code unit = 'b' +Subject length lower bound = 3 + +/(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I +Failed: error 160 at offset 12: (*VERB) not recognized or malformed + +/\h/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x20 \xa0 \xff +No last code unit +Subject length lower bound = 1 + ABC\x{09} + 0: \x{09} + ABC\x{20} + 0: + ABC\x{a0} + 0: \x{a0} + ABC\x{1680} + 0: \x{1680} + ABC\x{180e} + 0: \x{180e} + ABC\x{2000} + 0: \x{2000} + ABC\x{202f} + 0: \x{202f} + ABC\x{205f} + 0: \x{205f} + ABC\x{3000} 
+ 0: \x{3000} + +/\v/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + ABC\x{0a} + 0: \x{0a} + ABC\x{0b} + 0: \x{0b} + ABC\x{0c} + 0: \x{0c} + ABC\x{0d} + 0: \x{0d} + ABC\x{85} + 0: \x{85} + ABC\x{2028} + 0: \x{2028} + +/\h*A/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x20 A \xa0 \xff +Last code unit = 'A' +Subject length lower bound = 1 + CDBABC + 0: A + \x{2000}ABC + 0: \x{2000}A + +/\R*A/I,bsr=unicode,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d A \x85 \xff +Last code unit = 'A' +Subject length lower bound = 1 + CDBABC + 0: A + \x{2028}A + 0: \x{2028}A + +/\v+A/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +Last code unit = 'A' +Subject length lower bound = 2 + +/\s?xxx\s/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x +Last code unit = 'x' +Subject length lower bound = 4 + +/\sxxx\s/I,utf,tables=2 +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \x85 \xa0 +Last code unit = 'x' +Subject length lower bound = 5 + AB\x{85}xxx\x{a0}XYZ + 0: \x{85}xxx\x{a0} + AB\x{a0}xxx\x{85}XYZ + 0: \x{a0}xxx\x{85} + +/\S \S/I,utf,tables=2 +Capturing subpattern count = 0 +Options: utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f + \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e + \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
@ A B C + D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h + i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 + \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 + \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3 \xa4 + \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 + \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 + \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 + \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 + \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef + \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe + \xff +Last code unit = ' ' +Subject length lower bound = 3 + \x{a2} \x{84} + 0: \x{a2} \x{84} + A Z + 0: A Z + +/a+/utf + a\x{123}aa\=offset=1 + 0: aa + a\x{123}aa\=offset=2 + 0: aa + a\x{123}aa\=offset=3 + 0: a + a\x{123}aa\=offset=4 +No match + a\x{123}aa\=offset=5 +Failed: error -34: bad offset value + a\x{123}aa\=offset=6 +Failed: error -34: bad offset value + +/\x{1234}+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +No last code unit +Subject length lower bound = 1 + +/\x{1234}+?/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +No last code unit +Subject length lower bound = 1 + +/\x{1234}++/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +No last code unit +Subject length lower bound = 1 + +/\x{1234}{2}/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +Last code unit = \x{1234} +Subject length lower bound = 2 + +/[^\x{c4}]/IB,utf +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 
+Options: utf +No first code unit +No last code unit +Subject length lower bound = 1 + +/X+\x{200}/IB,utf +------------------------------------------------------------------ + Bra + X++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'X' +Last code unit = \x{200} +Subject length lower bound = 2 + +/\R/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + +# Check bad offset + +/a/utf + \x{10000}\=offset=1 +Error -36 (bad UTF-16 offset) + \x{10000}ab\=offset=1 +Error -36 (bad UTF-16 offset) + \x{10000}ab\=offset=2 + 0: a + \x{10000}ab\=offset=3 +No match + \x{10000}ab\=offset=4 +No match + \x{10000}ab\=offset=5 +Failed: error -34: bad offset value + +//utf +Failed: error -26 at offset 0: UTF-16 error: isolated low surrogate + +/\w+\x{C4}/B,utf +------------------------------------------------------------------ + Bra + \w++ + \x{c4} + Ket + End +------------------------------------------------------------------ + a\x{C4}\x{C4} + 0: a\x{c4} + +/\w+\x{C4}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \w+ + \x{c4} + Ket + End +------------------------------------------------------------------ + a\x{C4}\x{C4} + 0: a\x{c4}\x{c4} + +/\W+\x{C4}/B,utf +------------------------------------------------------------------ + Bra + \W+ + \x{c4} + Ket + End +------------------------------------------------------------------ + !\x{C4} + 0: !\x{c4} + +/\W+\x{C4}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \W++ + \x{c4} + Ket + End +------------------------------------------------------------------ + !\x{C4} + 0: !\x{c4} + +/\W+\x{A1}/B,utf +------------------------------------------------------------------ + Bra + \W+ + \x{a1} + Ket + End 
+------------------------------------------------------------------ + !\x{A1} + 0: !\x{a1} + +/\W+\x{A1}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \W+ + \x{a1} + Ket + End +------------------------------------------------------------------ + !\x{A1} + 0: !\x{a1} + +/X\s+\x{A0}/B,utf +------------------------------------------------------------------ + Bra + X + \s++ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x20\x{A0}\x{A0} + 0: X \x{a0} + +/X\s+\x{A0}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + X + \s+ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x20\x{A0}\x{A0} + 0: X \x{a0}\x{a0} + +/\S+\x{A0}/B,utf +------------------------------------------------------------------ + Bra + \S+ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x{A0}\x{A0} + 0: X\x{a0}\x{a0} + +/\S+\x{A0}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \S++ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x{A0}\x{A0} + 0: X\x{a0} + +/\x{a0}+\s!/B,utf +------------------------------------------------------------------ + Bra + \x{a0}++ + \s + ! + Ket + End +------------------------------------------------------------------ + \x{a0}\x20! + 0: \x{a0} ! + +/\x{a0}+\s!/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \x{a0}+ + \s + ! + Ket + End +------------------------------------------------------------------ + \x{a0}\x20! + 0: \x{a0} ! 
+ +/(*UTF)abc/never_utf +Failed: error 174 at offset 6: using UTF is disabled by the application + +/abc/utf,never_utf +Failed: error 174 at offset 0: using UTF is disabled by the application + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf +------------------------------------------------------------------ + Bra + /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless utf +First code unit = 'A' (caseless) +Last code unit = \x{1fb0} (caseless) +Subject length lower bound = 5 + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf +------------------------------------------------------------------ + Bra + A\x{391}\x{10427}\x{ff3a}\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = \x{1fb0} +Subject length lower bound = 5 + +/AB\x{1fb0}/IB,utf +------------------------------------------------------------------ + Bra + AB\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = \x{1fb0} +Subject length lower bound = 3 + +/AB\x{1fb0}/IBi,utf +------------------------------------------------------------------ + Bra + /i AB\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless utf +First code unit = 'A' (caseless) +Last code unit = \x{1fb0} (caseless) +Subject length lower bound = 3 + +/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{401} (caseless) +Last code unit = \x{42f} (caseless) +Subject length lower bound = 17 + 
\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + +/[ⱥ]/Bi,utf +------------------------------------------------------------------ + Bra + /i \x{2c65} + Ket + End +------------------------------------------------------------------ + +/[^ⱥ]/Bi,utf +------------------------------------------------------------------ + Bra + /i [^\x{2c65}] + Ket + End +------------------------------------------------------------------ + +/[[:blank:]]/B,ucp +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] + Ket + End +------------------------------------------------------------------ + +/\x{212a}+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: K k \xff +No last code unit +Subject length lower bound = 1 + KKkk\x{212a} + 0: KKkk\x{212a} + +/s+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: S s \xff +No last code unit +Subject length lower bound = 1 + SSss\x{17f} + 0: SSss\x{17f} + +# Non-UTF characters should give errors in both 16-bit and 32-bit modes. 
+ +/\x{110000}/utf +Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large + +/\o{4200000}/utf +Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large + +/\C/utf + \x{110000} +** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16 + +# End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 new file mode 100644 index 0000000..9f7393e --- /dev/null +++ b/testdata/testoutput12-32 @@ -0,0 +1,1157 @@ +# This set of tests is for UTF-16 and UTF-32 support, and is relevant only to +# the 16-bit and 32-bit libraries. The output is different for each library, +# so there are separate output files. + +/xxx/IB,utf,no_utf_check +** Failed: invalid UTF-8 string cannot be converted to 32-bit string + +/abc/utf + ] +** Failed: invalid UTF-8 string cannot be used as input in UTF mode + +/X(\C{3})/utf + X\x{11234}Y +No match + X\x{11234}YZ + 0: X\x{11234}YZ + 1: \x{11234}YZ + +/X(\C{4})/utf + X\x{11234}YZ +No match + X\x{11234}YZW + 0: X\x{11234}YZW + 1: \x{11234}YZW + +/X\C*/utf + XYZabcdce + 0: XYZabcdce + +/X\C*?/utf + XYZabcde + 0: X + +/X\C{3,5}/utf + Xabcdefg + 0: Xabcde + X\x{11234}Y +No match + X\x{11234}YZ + 0: X\x{11234}YZ + X\x{11234}\x{512} +No match + X\x{11234}\x{512}YZ + 0: X\x{11234}\x{512}YZ + X\x{11234}\x{512}\x{11234}Z + 0: X\x{11234}\x{512}\x{11234}Z + +/X\C{3,5}?/utf + Xabcdefg + 0: Xabc + X\x{11234}Y +No match + X\x{11234}YZ + 0: X\x{11234}YZ + X\x{11234}\x{512}YZ + 0: X\x{11234}\x{512}Y + *** Failers +No match + X\x{11234} +No match + +/a\Cb/utf + aXb + 0: aXb + a\nb + 0: a\x{0a}b + +/a\C\Cb/utf + a\x{12257}b +No match + a\x{12257}\x{11234}b + 0: a\x{12257}\x{11234}b + ** Failers +No match + a\x{100}b +No match + +/ab\Cde/utf + abXde + 0: abXde + +# Check maximum character size + +/\x{ffff}/IB,utf +------------------------------------------------------------------ + Bra + \x{ffff} + Ket + End 
+------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{ffff} +No last code unit +Subject length lower bound = 1 + +/\x{10000}/IB,utf +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{10000} +No last code unit +Subject length lower bound = 1 + +/\x{100}/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/\x{1000}/IB,utf +------------------------------------------------------------------ + Bra + \x{1000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{1000} +No last code unit +Subject length lower bound = 1 + +/\x{10000}/IB,utf +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{10000} +No last code unit +Subject length lower bound = 1 + +/\x{100000}/IB,utf +------------------------------------------------------------------ + Bra + \x{100000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100000} +No last code unit +Subject length lower bound = 1 + +/\x{10ffff}/IB,utf +------------------------------------------------------------------ + Bra + \x{10ffff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code 
unit = \x{10ffff} +No last code unit +Subject length lower bound = 1 + +/[\x{ff}]/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xff +No last code unit +Subject length lower bound = 1 + +/[\x{100}]/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/\x80/IB,utf +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x80 +No last code unit +Subject length lower bound = 1 + +/\xff/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xff +No last code unit +Subject length lower bound = 1 + +/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf +------------------------------------------------------------------ + Bra + \x{d55c}\x{ad6d}\x{c5b4} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d55c} +Last code unit = \x{c5b4} +Subject length lower bound = 3 + \x{D55c}\x{ad6d}\x{C5B4} + 0: \x{d55c}\x{ad6d}\x{c5b4} + +/\x{65e5}\x{672c}\x{8a9e}/IB,utf +------------------------------------------------------------------ + Bra + \x{65e5}\x{672c}\x{8a9e} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{65e5} +Last code unit = 
\x{8a9e} +Subject length lower bound = 3 + \x{65e5}\x{672c}\x{8a9e} + 0: \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/IB,utf +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x80 +No last code unit +Subject length lower bound = 1 + +/\x{084}/IB,utf +------------------------------------------------------------------ + Bra + \x{84} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x84 +No last code unit +Subject length lower bound = 1 + +/\x{104}/IB,utf +------------------------------------------------------------------ + Bra + \x{104} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{104} +No last code unit +Subject length lower bound = 1 + +/\x{861}/IB,utf +------------------------------------------------------------------ + Bra + \x{861} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{861} +No last code unit +Subject length lower bound = 1 + +/\x{212ab}/IB,utf +------------------------------------------------------------------ + Bra + \x{212ab} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{212ab} +No last code unit +Subject length lower bound = 1 + +# This one is here not because it's different to Perl, but because the way +# the captured single-byte is displayed. (In Perl it becomes a character, and you +# can't tell the difference.) 
+ +/X(\C)(.*)/utf + X\x{1234} + 0: X\x{1234} + 1: \x{1234} + 2: + X\nabc + 0: X\x{0a}abc + 1: \x{0a} + 2: abc + +# This one is here because Perl gives out a grumbly error message (quite +# correctly, but that messes up comparisons). + +/a\Cb/utf + *** Failers +No match + a\x{100}b + 0: a\x{100}b + +/[^ab\xC0-\xF0]/IB,utf +------------------------------------------------------------------ + Bra + [\x00-`c-\xbf\xf1-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e + \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d + \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac + \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb + \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb + \xfc \xfd \xfe \xff +No last code unit +Subject length lower bound = 1 + \x{f1} + 0: \x{f1} + \x{bf} + 0: \x{bf} + \x{100} + 0: \x{100} + \x{1000} + 0: \x{1000} + *** Failers + 0: * + \x{c0} +No match + \x{f0} +No match + +/Ā{3,4}/IB,utf +------------------------------------------------------------------ + Bra + \x{100}{3} + \x{100}?+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +Last code unit = \x{100} +Subject length lower bound = 3 + \x{100}\x{100}\x{100}\x{100\x{100} + 0: \x{100}\x{100}\x{100} + +/(\x{100}+|x)/IB,utf 
+------------------------------------------------------------------ + Bra + CBra 1 + \x{100}++ + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: x \xff +No last code unit +Subject length lower bound = 1 + +/(\x{100}*a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}*+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: a x \xff +No last code unit +Subject length lower bound = 1 + +/(\x{100}{0,2}a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}{0,2}+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: a x \xff +No last code unit +Subject length lower bound = 1 + +/(\x{100}{1,2}a|x)/IB,utf +------------------------------------------------------------------ + Bra + CBra 1 + \x{100} + \x{100}{0,1}+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +Starting code units: x \xff +No last code unit +Subject length lower bound = 1 + +/\x{100}/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + +/a\x{100}\x{101}*/IB,utf +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}*+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'a' +Last code unit = \x{100} 
+Subject length lower bound = 2 + +/a\x{100}\x{101}+/IB,utf +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}++ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'a' +Last code unit = \x{101} +Subject length lower bound = 3 + +/[^\x{c4}]/IB +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/[\x{100}]/IB,utf +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +No last code unit +Subject length lower bound = 1 + \x{100} + 0: \x{100} + Z\x{100} + 0: \x{100} + \x{100}Z + 0: \x{100} + *** Failers +No match + +/[\xff]/IB,utf +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xff +No last code unit +Subject length lower bound = 1 + >\x{ff}< + 0: \x{ff} + +/[^\xff]/IB,utf +------------------------------------------------------------------ + Bra + [^\x{ff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first code unit +No last code unit +Subject length lower bound = 1 + +/\x{100}abc(xyz(?1))/IB,utf +------------------------------------------------------------------ + Bra + \x{100}abc + CBra 1 + xyz + Recurse + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +First code unit = \x{100} 
+Last code unit = 'z' +Subject length lower bound = 7 + +/\777/I,utf +Capturing subpattern count = 0 +Options: utf +First code unit = \x{1ff} +No last code unit +Subject length lower bound = 1 + \x{1ff} + 0: \x{1ff} + \777 + 0: \x{1ff} + +/\x{100}+\x{200}/IB,utf +------------------------------------------------------------------ + Bra + \x{100}++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +Last code unit = \x{200} +Subject length lower bound = 2 + +/\x{100}+X/IB,utf +------------------------------------------------------------------ + Bra + \x{100}++ + X + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{100} +Last code unit = 'X' +Subject length lower bound = 2 + +/^[\QĀ\E-\QŐ\E/B,utf +Failed: error 106 at offset 13: missing terminating ] for character class + +/X/utf + \x{d800} +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined + \x{d800}\=no_utf_check +No match + \x{da00} +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined + \x{da00}\=no_utf_check +No match + \x{dc00} +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined + \x{dc00}\=no_utf_check +No match + \x{de00} +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined + \x{de00}\=no_utf_check +No match + \x{dfff} +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined + \x{dfff}\=no_utf_check +No match + \x{110000} +Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined + \x{d800}\x{1234} +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined + +/(*UTF16)\x{11234}/ +Failed: error 160 at offset 5: (*VERB) not recognized or malformed + abcd\x{11234}pqr + +/(*UTF)\x{11234}/I +Capturing subpattern count = 0 +Compile options: +Overall options: 
utf +First code unit = \x{11234} +No last code unit +Subject length lower bound = 1 + abcd\x{11234}pqr + 0: \x{11234} + +/(*UTF-32)\x{11234}/ +Failed: error 160 at offset 5: (*VERB) not recognized or malformed + abcd\x{11234}pqr + +/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I +Failed: error 160 at offset 12: (*VERB) not recognized or malformed + +/(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I +Capturing subpattern count = 0 +Compile options: +Overall options: utf +Forced newline is CRLF +First code unit = 'a' +Last code unit = 'b' +Subject length lower bound = 3 + +/\h/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x20 \xa0 \xff +No last code unit +Subject length lower bound = 1 + ABC\x{09} + 0: \x{09} + ABC\x{20} + 0: + ABC\x{a0} + 0: \x{a0} + ABC\x{1680} + 0: \x{1680} + ABC\x{180e} + 0: \x{180e} + ABC\x{2000} + 0: \x{2000} + ABC\x{202f} + 0: \x{202f} + ABC\x{205f} + 0: \x{205f} + ABC\x{3000} + 0: \x{3000} + +/\v/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + ABC\x{0a} + 0: \x{0a} + ABC\x{0b} + 0: \x{0b} + ABC\x{0c} + 0: \x{0c} + ABC\x{0d} + 0: \x{0d} + ABC\x{85} + 0: \x{85} + ABC\x{2028} + 0: \x{2028} + +/\h*A/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x20 A \xa0 \xff +Last code unit = 'A' +Subject length lower bound = 1 + CDBABC + 0: A + \x{2000}ABC + 0: \x{2000}A + +/\R*A/I,bsr=unicode,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d A \x85 \xff +Last code unit = 'A' +Subject length lower bound = 1 + CDBABC + 0: A + \x{2028}A + 0: \x{2028}A + +/\v+A/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +Last code unit = 'A' +Subject length lower bound = 2 + +/\s?xxx\s/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x +Last code unit = 'x' +Subject length 
lower bound = 4 + +/\sxxx\s/I,utf,tables=2 +Capturing subpattern count = 0 +Options: utf +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \x85 \xa0 +Last code unit = 'x' +Subject length lower bound = 5 + AB\x{85}xxx\x{a0}XYZ + 0: \x{85}xxx\x{a0} + AB\x{a0}xxx\x{85}XYZ + 0: \x{a0}xxx\x{85} + +/\S \S/I,utf,tables=2 +Capturing subpattern count = 0 +Options: utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f + \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e + \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C + D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h + i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 + \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 + \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3 \xa4 + \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 + \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 + \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 + \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 + \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef + \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe + \xff +Last code unit = ' ' +Subject length lower bound = 3 + \x{a2} \x{84} + 0: \x{a2} \x{84} + A Z + 0: A Z + +/a+/utf + a\x{123}aa\=offset=1 + 0: aa + a\x{123}aa\=offset=2 + 0: aa + a\x{123}aa\=offset=3 + 0: a + a\x{123}aa\=offset=4 +No match + a\x{123}aa\=offset=5 +Failed: error -34: bad offset value + a\x{123}aa\=offset=6 +Failed: error -34: bad offset value + +/\x{1234}+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +No last code unit +Subject length lower bound = 1 + +/\x{1234}+?/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +No last 
code unit +Subject length lower bound = 1 + +/\x{1234}++/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +No last code unit +Subject length lower bound = 1 + +/\x{1234}{2}/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{1234} +Last code unit = \x{1234} +Subject length lower bound = 2 + +/[^\x{c4}]/IB,utf +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first code unit +No last code unit +Subject length lower bound = 1 + +/X+\x{200}/IB,utf +------------------------------------------------------------------ + Bra + X++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'X' +Last code unit = \x{200} +Subject length lower bound = 2 + +/\R/I,utf +Capturing subpattern count = 0 +Options: utf +Starting code units: \x0a \x0b \x0c \x0d \x85 \xff +No last code unit +Subject length lower bound = 1 + +# Check bad offset + +/a/utf + \x{10000}\=offset=1 +No match + \x{10000}ab\=offset=1 + 0: a + \x{10000}ab\=offset=2 +No match + \x{10000}ab\=offset=3 +No match + \x{10000}ab\=offset=4 +Failed: error -34: bad offset value + \x{10000}ab\=offset=5 +Failed: error -34: bad offset value + +//utf +Failed: error -27 at offset 0: UTF-32 error: code points 0xd800-0xdfff are not defined + +/\w+\x{C4}/B,utf +------------------------------------------------------------------ + Bra + \w++ + \x{c4} + Ket + End +------------------------------------------------------------------ + a\x{C4}\x{C4} + 0: a\x{c4} + +/\w+\x{C4}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \w+ + \x{c4} + Ket + End +------------------------------------------------------------------ + a\x{C4}\x{C4} + 0: a\x{c4}\x{c4} + 
+/\W+\x{C4}/B,utf +------------------------------------------------------------------ + Bra + \W+ + \x{c4} + Ket + End +------------------------------------------------------------------ + !\x{C4} + 0: !\x{c4} + +/\W+\x{C4}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \W++ + \x{c4} + Ket + End +------------------------------------------------------------------ + !\x{C4} + 0: !\x{c4} + +/\W+\x{A1}/B,utf +------------------------------------------------------------------ + Bra + \W+ + \x{a1} + Ket + End +------------------------------------------------------------------ + !\x{A1} + 0: !\x{a1} + +/\W+\x{A1}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \W+ + \x{a1} + Ket + End +------------------------------------------------------------------ + !\x{A1} + 0: !\x{a1} + +/X\s+\x{A0}/B,utf +------------------------------------------------------------------ + Bra + X + \s++ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x20\x{A0}\x{A0} + 0: X \x{a0} + +/X\s+\x{A0}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + X + \s+ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x20\x{A0}\x{A0} + 0: X \x{a0}\x{a0} + +/\S+\x{A0}/B,utf +------------------------------------------------------------------ + Bra + \S+ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x{A0}\x{A0} + 0: X\x{a0}\x{a0} + +/\S+\x{A0}/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \S++ + \x{a0} + Ket + End +------------------------------------------------------------------ + X\x{A0}\x{A0} + 0: X\x{a0} + +/\x{a0}+\s!/B,utf +------------------------------------------------------------------ + Bra + \x{a0}++ + \s + ! + Ket + End +------------------------------------------------------------------ + \x{a0}\x20! 
+ 0: \x{a0} ! + +/\x{a0}+\s!/B,utf,tables=2 +------------------------------------------------------------------ + Bra + \x{a0}+ + \s + ! + Ket + End +------------------------------------------------------------------ + \x{a0}\x20! + 0: \x{a0} ! + +/(*UTF)abc/never_utf +Failed: error 174 at offset 6: using UTF is disabled by the application + +/abc/utf,never_utf +Failed: error 174 at offset 0: using UTF is disabled by the application + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf +------------------------------------------------------------------ + Bra + /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless utf +First code unit = 'A' (caseless) +Last code unit = \x{1fb0} (caseless) +Subject length lower bound = 5 + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf +------------------------------------------------------------------ + Bra + A\x{391}\x{10427}\x{ff3a}\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = \x{1fb0} +Subject length lower bound = 5 + +/AB\x{1fb0}/IB,utf +------------------------------------------------------------------ + Bra + AB\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = \x{1fb0} +Subject length lower bound = 3 + +/AB\x{1fb0}/IBi,utf +------------------------------------------------------------------ + Bra + /i AB\x{1fb0} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: caseless utf +First code unit = 'A' (caseless) +Last code unit = \x{1fb0} (caseless) +Subject length lower bound = 3 + +/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf 
+Capturing subpattern count = 0 +Options: caseless utf +First code unit = \x{401} (caseless) +Last code unit = \x{42f} (caseless) +Subject length lower bound = 17 + \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} + \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} + +/[ⱥ]/Bi,utf +------------------------------------------------------------------ + Bra + /i \x{2c65} + Ket + End +------------------------------------------------------------------ + +/[^ⱥ]/Bi,utf +------------------------------------------------------------------ + Bra + /i [^\x{2c65}] + Ket + End +------------------------------------------------------------------ + +/[[:blank:]]/B,ucp +------------------------------------------------------------------ + Bra + [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}] + Ket + End +------------------------------------------------------------------ + +/\x{212a}+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: K k \xff +No last code unit +Subject length lower bound = 1 + KKkk\x{212a} + 0: KKkk\x{212a} + +/s+/Ii,utf +Capturing subpattern count = 0 +Options: caseless utf +Starting code units: S s \xff +No last code unit +Subject length lower bound = 1 + SSss\x{17f} + 0: SSss\x{17f} + +# Non-UTF characters should give errors in both 16-bit and 32-bit modes. 
+ +/\x{110000}/utf +Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large + +/\o{4200000}/utf +Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large + +/\C/utf + \x{110000} +Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined + +# End of testinput12 diff --git a/testdata/testoutput13 b/testdata/testoutput13 new file mode 100644 index 0000000..f737ebe --- /dev/null +++ b/testdata/testoutput13 @@ -0,0 +1,27 @@ +# These DFA tests are for the handling of characters greater than 255 in +# 16-bit or 32-bit, non-UTF mode. + +#forbid_utf +#subject dfa + +/^\x{ffff}+/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}?/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}*/i + \x{ffff} + 0: \x{ffff} + +/^\x{ffff}{3}/i + \x{ffff}\x{ffff}\x{ffff} + 0: \x{ffff}\x{ffff}\x{ffff} + +/^\x{ffff}{0,3}/i + \x{ffff} + 0: \x{ffff} + +# End of testinput13 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 599e0ce..3cb8a6c 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -6723,7 +6723,7 @@ Subject length lower bound = 5 1: \x0d 2: \x0a -+((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)+I +!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I Capturing subpattern count = 1 May match empty string No options diff --git a/testdata/testoutput5 b/testdata/testoutput5 index a7862f2..d58cdb8 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1281,13 +1281,13 @@ Partial match: abcde X\=ps Partial match: X -/\sxxx\s/utf,tables=1 +/\sxxx\s/utf,tables=2 AB\x{85}xxx\x{a0}XYZ -No match + 0: \x{85}xxx\x{a0} AB\x{a0}xxx\x{85}XYZ -No match + 0: \x{a0}xxx\x{85} -/\S \S/utf,tables=1 +/\S \S/utf,tables=2 \x{a2} \x{84} 0: \x{a2} \x{84} diff --git a/testdata/testoutput8-16 b/testdata/testoutput8-16 new file mode 100644 index 0000000..c51b406 --- /dev/null +++ b/testdata/testoutput8-16 @@ -0,0 +1,745 @@ +# These are a few representative patterns whose lengths and offsets are to be +# shown when the link size is 2. 
This is just a doublecheck test to ensure the +# sizes don't go horribly wrong when something is changed. The pattern contents +# are all themselves checked in other tests. Unicode, including property +# support, is required for these tests. + +#pattern fullbincode,memory + +/((?i)b)/ +Memory allocation (code space): 24 +------------------------------------------------------------------ + 0 9 Bra + 2 5 CBra 1 + 5 /i b + 7 5 Ket + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/(?s)(.*X|^B)/ +Memory allocation (code space): 38 +------------------------------------------------------------------ + 0 16 Bra + 2 7 CBra 1 + 5 AllAny* + 7 X + 9 5 Alt + 11 ^ + 12 B + 14 12 Ket + 16 16 Ket + 18 End +------------------------------------------------------------------ + +/(?s:.*X|^B)/ +Memory allocation (code space): 36 +------------------------------------------------------------------ + 0 15 Bra + 2 6 Bra + 4 AllAny* + 6 X + 8 5 Alt + 10 ^ + 11 B + 13 11 Ket + 15 15 Ket + 17 End +------------------------------------------------------------------ + +/^[[:alnum:]]/ +Memory allocation (code space): 46 +------------------------------------------------------------------ + 0 20 Bra + 2 ^ + 3 [0-9A-Za-z] + 20 20 Ket + 22 End +------------------------------------------------------------------ + +/#/Ix +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 2 Bra + 2 2 Ket + 4 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +May match empty string +Options: extended +No first code unit +No last code unit +Subject length lower bound = 0 + +/a#/Ix +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 a + 4 4 Ket + 6 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: extended +First code unit = 'a' +No last code 
unit +Subject length lower bound = 1 + +/x?+/ +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 x?+ + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/x++/ +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 x++ + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/x{1,3}+/ +Memory allocation (code space): 20 +------------------------------------------------------------------ + 0 7 Bra + 2 x + 4 x{0,2}+ + 7 7 Ket + 9 End +------------------------------------------------------------------ + +/(x)*+/ +Memory allocation (code space): 26 +------------------------------------------------------------------ + 0 10 Bra + 2 Braposzero + 3 5 CBraPos 1 + 6 x + 8 5 KetRpos + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/ +Memory allocation (code space): 142 +------------------------------------------------------------------ + 0 68 Bra + 2 ^ + 3 63 CBra 1 + 6 5 CBra 2 + 9 a+ + 11 5 Ket + 13 21 CBra 3 + 16 [ab]+? 
+ 34 21 Ket + 36 21 CBra 4 + 39 [bc]+ + 57 21 Ket + 59 5 CBra 5 + 62 \w*+ + 64 5 Ket + 66 63 Ket + 68 68 Ket + 70 End +------------------------------------------------------------------ + +"8J\$WE\<\.rX\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" +Memory allocation (code space): 1648 +------------------------------------------------------------------ + 0 821 Bra + 2 8J$WE<.rX+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDDqmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X +820 \b +821 821 Ket +823 End +------------------------------------------------------------------ + +"\$\<\.X\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" +Memory allocation (code space): 1628 +------------------------------------------------------------------ + 0 811 Bra + 2 
$<.X+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDDqmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X +810 \b +811 811 Ket +813 End +------------------------------------------------------------------ + +/(a(?1)b)/ +Memory allocation (code space): 32 +------------------------------------------------------------------ + 0 13 Bra + 2 9 CBra 1 + 5 a + 7 2 Recurse + 9 b + 11 9 Ket + 13 13 Ket + 15 End +------------------------------------------------------------------ + +/(a(?1)+b)/ +Memory allocation (code space): 40 +------------------------------------------------------------------ + 0 17 Bra + 2 13 CBra 1 + 5 a + 7 4 Once + 9 2 Recurse + 11 4 KetRmax + 13 b + 15 13 Ket + 17 17 Ket + 19 End +------------------------------------------------------------------ + +/a(?Pb|c)d(?Pe)/ +Memory allocation (code space): 54 +------------------------------------------------------------------ + 0 24 Bra + 2 a + 4 5 CBra 1 + 7 b + 9 4 Alt + 11 c + 13 9 Ket + 15 d + 17 5 CBra 2 + 20 e + 22 5 Ket + 24 24 Ket + 26 End +------------------------------------------------------------------ + +/(?:a(?Pc(?Pd)))(?Pa)/ +Memory allocation (code space): 64 +------------------------------------------------------------------ + 0 29 Bra + 2 18 Bra + 4 a + 6 12 CBra 1 + 9 c + 11 5 CBra 2 + 14 d + 16 5 Ket + 18 12 Ket + 20 18 Ket + 22 5 CBra 3 + 25 a + 27 5 Ket + 29 29 Ket + 31 End +------------------------------------------------------------------ + +/(?Pa)...(?P=a)bbb(?P>a)d/ +Memory allocation (code space): 54 +------------------------------------------------------------------ + 0 24 Bra + 2 5 CBra 1 + 5 a + 7 5 Ket + 9 Any + 10 Any + 11 Any + 12 \1 + 14 bbb + 20 2 Recurse + 22 d + 24 24 Ket + 26 End 
+------------------------------------------------------------------ + +/abc(?C255)de(?C)f/ +Memory allocation (code space): 50 +------------------------------------------------------------------ + 0 22 Bra + 2 abc + 8 Callout 255 10 1 + 12 de + 16 Callout 0 16 1 + 20 f + 22 22 Ket + 24 End +------------------------------------------------------------------ + +/abcde/auto_callout +Memory allocation (code space): 78 +------------------------------------------------------------------ + 0 36 Bra + 2 Callout 255 0 1 + 6 a + 8 Callout 255 1 1 + 12 b + 14 Callout 255 2 1 + 18 c + 20 Callout 255 3 1 + 24 d + 26 Callout 255 4 1 + 30 e + 32 Callout 255 5 0 + 36 36 Ket + 38 End +------------------------------------------------------------------ + +/\x{100}/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{1000}/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{1000} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{10000}/utf +Memory allocation (code space): 16 +------------------------------------------------------------------ + 0 5 Bra + 2 \x{10000} + 5 5 Ket + 7 End +------------------------------------------------------------------ + +/\x{100000}/utf +Memory allocation (code space): 16 +------------------------------------------------------------------ + 0 5 Bra + 2 \x{100000} + 5 5 Ket + 7 End +------------------------------------------------------------------ + +/\x{10ffff}/utf +Memory allocation (code space): 16 +------------------------------------------------------------------ + 0 5 Bra + 2 \x{10ffff} + 5 5 Ket + 7 End +------------------------------------------------------------------ + +/\x{110000}/utf +Failed: error 134 at offset 9: character code point value in \x{} or \o{} is 
too large + +/[\x{ff}]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{ff} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[\x{100}]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x80/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{80} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\xff/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{ff} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{0041}\x{2262}\x{0391}\x{002e}/I,utf +Memory allocation (code space): 26 +------------------------------------------------------------------ + 0 10 Bra + 2 A\x{2262}\x{391}. + 10 10 Ket + 12 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = '.' 
+Subject length lower bound = 4 + +/\x{D55c}\x{ad6d}\x{C5B4}/I,utf +Memory allocation (code space): 22 +------------------------------------------------------------------ + 0 8 Bra + 2 \x{d55c}\x{ad6d}\x{c5b4} + 8 8 Ket + 10 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d55c} +Last code unit = \x{c5b4} +Subject length lower bound = 3 + +/\x{65e5}\x{672c}\x{8a9e}/I,utf +Memory allocation (code space): 22 +------------------------------------------------------------------ + 0 8 Bra + 2 \x{65e5}\x{672c}\x{8a9e} + 8 8 Ket + 10 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{65e5} +Last code unit = \x{8a9e} +Subject length lower bound = 3 + +/[\x{100}]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[Z\x{100}]/utf +Memory allocation (code space): 54 +------------------------------------------------------------------ + 0 24 Bra + 2 [Z\x{100}] + 24 24 Ket + 26 End +------------------------------------------------------------------ + +/^[\x{100}\E-\Q\E\x{150}]/utf +Memory allocation (code space): 26 +------------------------------------------------------------------ + 0 10 Bra + 2 ^ + 3 [\x{100}-\x{150}] + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/^[\QĀ\E-\QŐ\E]/utf +Memory allocation (code space): 26 +------------------------------------------------------------------ + 0 10 Bra + 2 ^ + 3 [\x{100}-\x{150}] + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/^[\QĀ\E-\QŐ\E/utf +Failed: error 106 at offset 13: missing terminating ] for character class + +/[\p{L}]/ +Memory allocation (code space): 24 
+------------------------------------------------------------------ + 0 9 Bra + 2 [\p{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\p{^L}]/ +Memory allocation (code space): 24 +------------------------------------------------------------------ + 0 9 Bra + 2 [\P{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\P{L}]/ +Memory allocation (code space): 24 +------------------------------------------------------------------ + 0 9 Bra + 2 [\P{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\P{^L}]/ +Memory allocation (code space): 24 +------------------------------------------------------------------ + 0 9 Bra + 2 [\p{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[abc\p{L}\x{0660}]/utf +Memory allocation (code space): 60 +------------------------------------------------------------------ + 0 27 Bra + 2 [a-c\p{L}\x{660}] + 27 27 Ket + 29 End +------------------------------------------------------------------ + +/[\p{Nd}]/utf +Memory allocation (code space): 24 +------------------------------------------------------------------ + 0 9 Bra + 2 [\p{Nd}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\p{Nd}+-]+/utf +Memory allocation (code space): 58 +------------------------------------------------------------------ + 0 26 Bra + 2 [+\-\p{Nd}]++ + 26 26 Ket + 28 End +------------------------------------------------------------------ + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/i,utf +Memory allocation (code space): 32 +------------------------------------------------------------------ + 0 13 Bra + 2 /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} + 13 13 Ket + 15 End +------------------------------------------------------------------ + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/utf +Memory allocation (code space): 32 
+------------------------------------------------------------------ + 0 13 Bra + 2 A\x{391}\x{10427}\x{ff3a}\x{1fb0} + 13 13 Ket + 15 End +------------------------------------------------------------------ + +/[\x{105}-\x{109}]/i,utf +Memory allocation (code space): 24 +------------------------------------------------------------------ + 0 9 Bra + 2 [\x{104}-\x{109}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/( ( (?(1)0|) )* )/x +Memory allocation (code space): 52 +------------------------------------------------------------------ + 0 23 Bra + 2 19 CBra 1 + 5 Brazero + 6 13 SCBra 2 + 9 6 Cond + 11 1 Cond ref + 13 0 + 15 2 Alt + 17 8 Ket + 19 13 KetRmax + 21 19 Ket + 23 23 Ket + 25 End +------------------------------------------------------------------ + +/( (?(1)0|)* )/x +Memory allocation (code space): 42 +------------------------------------------------------------------ + 0 18 Bra + 2 14 CBra 1 + 5 Brazero + 6 6 SCond + 8 1 Cond ref + 10 0 + 12 2 Alt + 14 8 KetRmax + 16 14 Ket + 18 18 Ket + 20 End +------------------------------------------------------------------ + +/[a]/ +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 a + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[a]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 a + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[\xaa]/ +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{aa} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[\xaa]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{aa} + 4 4 Ket + 6 End 
+------------------------------------------------------------------ + +/[^a]/ +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 [^a] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[^a]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 [^a] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[^\xaa]/ +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 [^\x{aa}] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[^\xaa]/utf +Memory allocation (code space): 14 +------------------------------------------------------------------ + 0 4 Bra + 2 [^\x{aa}] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +#pattern -memory + +/[^\d]/utf,ucp +------------------------------------------------------------------ + 0 9 Bra + 2 [^\p{Nd}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[[:^alpha:][:^cntrl:]]+/utf,ucp +------------------------------------------------------------------ + 0 26 Bra + 2 [ -~\x80-\xff\P{L}]++ + 26 26 Ket + 28 End +------------------------------------------------------------------ + +/[[:^cntrl:][:^alpha:]]+/utf,ucp +------------------------------------------------------------------ + 0 26 Bra + 2 [ -~\x80-\xff\P{L}]++ + 26 26 Ket + 28 End +------------------------------------------------------------------ + +/[[:alpha:]]+/utf,ucp +------------------------------------------------------------------ + 0 10 Bra + 2 [\p{L}]++ + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/[[:^alpha:]\S]+/utf,ucp +------------------------------------------------------------------ + 0 13 Bra + 2 [\P{L}\P{Xsp}]++ + 13 13 Ket + 15 End 
+------------------------------------------------------------------ + +/abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ +------------------------------------------------------------------ + 0 60 Bra + 2 abc + 8 5 CBra 1 + 11 d + 13 4 Alt + 15 e + 17 9 Ket + 19 *THEN + 20 x + 22 12 CBra 2 + 25 123 + 31 *THEN + 32 4 + 34 24 Alt + 36 567 + 42 5 CBra 3 + 45 b + 47 4 Alt + 49 q + 51 9 Ket + 53 *THEN + 54 xx + 58 36 Ket + 60 60 Ket + 62 End +------------------------------------------------------------------ + +/(((a\2)|(a*)\g<-1>))*a?/ +------------------------------------------------------------------ + 0 39 Bra + 2 Brazero + 3 32 SCBra 1 + 6 27 Once + 8 12 CBra 2 + 11 7 CBra 3 + 14 a + 16 \2 + 18 7 Ket + 20 11 Alt + 22 5 CBra 4 + 25 a* + 27 5 Ket + 29 22 Recurse + 31 23 Ket + 33 27 Ket + 35 32 KetRmax + 37 a?+ + 39 39 Ket + 41 End +------------------------------------------------------------------ + +# End of testinput8 diff --git a/testdata/testoutput8-32 b/testdata/testoutput8-32 new file mode 100644 index 0000000..1cb5ff1 --- /dev/null +++ b/testdata/testoutput8-32 @@ -0,0 +1,745 @@ +# These are a few representative patterns whose lengths and offsets are to be +# shown when the link size is 2. This is just a doublecheck test to ensure the +# sizes don't go horribly wrong when something is changed. The pattern contents +# are all themselves checked in other tests. Unicode, including property +# support, is required for these tests. 
+ +#pattern fullbincode,memory + +/((?i)b)/ +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 9 Bra + 2 5 CBra 1 + 5 /i b + 7 5 Ket + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/(?s)(.*X|^B)/ +Memory allocation (code space): 76 +------------------------------------------------------------------ + 0 16 Bra + 2 7 CBra 1 + 5 AllAny* + 7 X + 9 5 Alt + 11 ^ + 12 B + 14 12 Ket + 16 16 Ket + 18 End +------------------------------------------------------------------ + +/(?s:.*X|^B)/ +Memory allocation (code space): 72 +------------------------------------------------------------------ + 0 15 Bra + 2 6 Bra + 4 AllAny* + 6 X + 8 5 Alt + 10 ^ + 11 B + 13 11 Ket + 15 15 Ket + 17 End +------------------------------------------------------------------ + +/^[[:alnum:]]/ +Memory allocation (code space): 60 +------------------------------------------------------------------ + 0 12 Bra + 2 ^ + 3 [0-9A-Za-z] + 12 12 Ket + 14 End +------------------------------------------------------------------ + +/#/Ix +Memory allocation (code space): 20 +------------------------------------------------------------------ + 0 2 Bra + 2 2 Ket + 4 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +May match empty string +Options: extended +No first code unit +No last code unit +Subject length lower bound = 0 + +/a#/Ix +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 a + 4 4 Ket + 6 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: extended +First code unit = 'a' +No last code unit +Subject length lower bound = 1 + +/x?+/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 x?+ + 4 4 Ket + 6 End 
+------------------------------------------------------------------ + +/x++/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 x++ + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/x{1,3}+/ +Memory allocation (code space): 40 +------------------------------------------------------------------ + 0 7 Bra + 2 x + 4 x{0,2}+ + 7 7 Ket + 9 End +------------------------------------------------------------------ + +/(x)*+/ +Memory allocation (code space): 52 +------------------------------------------------------------------ + 0 10 Bra + 2 Braposzero + 3 5 CBraPos 1 + 6 x + 8 5 KetRpos + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/ +Memory allocation (code space): 220 +------------------------------------------------------------------ + 0 52 Bra + 2 ^ + 3 47 CBra 1 + 6 5 CBra 2 + 9 a+ + 11 5 Ket + 13 13 CBra 3 + 16 [ab]+? 
+ 26 13 Ket + 28 13 CBra 4 + 31 [bc]+ + 41 13 Ket + 43 5 CBra 5 + 46 \w*+ + 48 5 Ket + 50 47 Ket + 52 52 Ket + 54 End +------------------------------------------------------------------ + +"8J\$WE\<\.rX\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" +Memory allocation (code space): 3296 +------------------------------------------------------------------ + 0 821 Bra + 2 8J$WE<.rX+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDDqmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X +820 \b +821 821 Ket +823 End +------------------------------------------------------------------ + +"\$\<\.X\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" +Memory allocation (code space): 3256 +------------------------------------------------------------------ + 0 811 Bra + 2 
$<.X+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDDqmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X +810 \b +811 811 Ket +813 End +------------------------------------------------------------------ + +/(a(?1)b)/ +Memory allocation (code space): 64 +------------------------------------------------------------------ + 0 13 Bra + 2 9 CBra 1 + 5 a + 7 2 Recurse + 9 b + 11 9 Ket + 13 13 Ket + 15 End +------------------------------------------------------------------ + +/(a(?1)+b)/ +Memory allocation (code space): 80 +------------------------------------------------------------------ + 0 17 Bra + 2 13 CBra 1 + 5 a + 7 4 Once + 9 2 Recurse + 11 4 KetRmax + 13 b + 15 13 Ket + 17 17 Ket + 19 End +------------------------------------------------------------------ + +/a(?Pb|c)d(?Pe)/ +Memory allocation (code space): 108 +------------------------------------------------------------------ + 0 24 Bra + 2 a + 4 5 CBra 1 + 7 b + 9 4 Alt + 11 c + 13 9 Ket + 15 d + 17 5 CBra 2 + 20 e + 22 5 Ket + 24 24 Ket + 26 End +------------------------------------------------------------------ + +/(?:a(?Pc(?Pd)))(?Pa)/ +Memory allocation (code space): 128 +------------------------------------------------------------------ + 0 29 Bra + 2 18 Bra + 4 a + 6 12 CBra 1 + 9 c + 11 5 CBra 2 + 14 d + 16 5 Ket + 18 12 Ket + 20 18 Ket + 22 5 CBra 3 + 25 a + 27 5 Ket + 29 29 Ket + 31 End +------------------------------------------------------------------ + +/(?Pa)...(?P=a)bbb(?P>a)d/ +Memory allocation (code space): 108 +------------------------------------------------------------------ + 0 24 Bra + 2 5 CBra 1 + 5 a + 7 5 Ket + 9 Any + 10 Any + 11 Any + 12 \1 + 14 bbb + 20 2 Recurse + 22 d + 24 24 Ket + 26 End 
+------------------------------------------------------------------ + +/abc(?C255)de(?C)f/ +Memory allocation (code space): 100 +------------------------------------------------------------------ + 0 22 Bra + 2 abc + 8 Callout 255 10 1 + 12 de + 16 Callout 0 16 1 + 20 f + 22 22 Ket + 24 End +------------------------------------------------------------------ + +/abcde/auto_callout +Memory allocation (code space): 156 +------------------------------------------------------------------ + 0 36 Bra + 2 Callout 255 0 1 + 6 a + 8 Callout 255 1 1 + 12 b + 14 Callout 255 2 1 + 18 c + 20 Callout 255 3 1 + 24 d + 26 Callout 255 4 1 + 30 e + 32 Callout 255 5 0 + 36 36 Ket + 38 End +------------------------------------------------------------------ + +/\x{100}/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{1000}/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{1000} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{10000}/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{10000} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{100000}/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100000} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{10ffff}/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{10ffff} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{110000}/utf +Failed: error 134 at offset 9: character code point value in \x{} or \o{} 
is too large + +/[\x{ff}]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{ff} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[\x{100}]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x80/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{80} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\xff/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{ff} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/\x{0041}\x{2262}\x{0391}\x{002e}/I,utf +Memory allocation (code space): 52 +------------------------------------------------------------------ + 0 10 Bra + 2 A\x{2262}\x{391}. + 10 10 Ket + 12 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = '.' 
+Subject length lower bound = 4 + +/\x{D55c}\x{ad6d}\x{C5B4}/I,utf +Memory allocation (code space): 44 +------------------------------------------------------------------ + 0 8 Bra + 2 \x{d55c}\x{ad6d}\x{c5b4} + 8 8 Ket + 10 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{d55c} +Last code unit = \x{c5b4} +Subject length lower bound = 3 + +/\x{65e5}\x{672c}\x{8a9e}/I,utf +Memory allocation (code space): 44 +------------------------------------------------------------------ + 0 8 Bra + 2 \x{65e5}\x{672c}\x{8a9e} + 8 8 Ket + 10 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \x{65e5} +Last code unit = \x{8a9e} +Subject length lower bound = 3 + +/[\x{100}]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{100} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[Z\x{100}]/utf +Memory allocation (code space): 76 +------------------------------------------------------------------ + 0 16 Bra + 2 [Z\x{100}] + 16 16 Ket + 18 End +------------------------------------------------------------------ + +/^[\x{100}\E-\Q\E\x{150}]/utf +Memory allocation (code space): 52 +------------------------------------------------------------------ + 0 10 Bra + 2 ^ + 3 [\x{100}-\x{150}] + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/^[\QĀ\E-\QŐ\E]/utf +Memory allocation (code space): 52 +------------------------------------------------------------------ + 0 10 Bra + 2 ^ + 3 [\x{100}-\x{150}] + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/^[\QĀ\E-\QŐ\E/utf +Failed: error 106 at offset 13: missing terminating ] for character class + +/[\p{L}]/ +Memory allocation (code space): 48 
+------------------------------------------------------------------ + 0 9 Bra + 2 [\p{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\p{^L}]/ +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 9 Bra + 2 [\P{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\P{L}]/ +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 9 Bra + 2 [\P{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\P{^L}]/ +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 9 Bra + 2 [\p{L}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[abc\p{L}\x{0660}]/utf +Memory allocation (code space): 88 +------------------------------------------------------------------ + 0 19 Bra + 2 [a-c\p{L}\x{660}] + 19 19 Ket + 21 End +------------------------------------------------------------------ + +/[\p{Nd}]/utf +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 9 Bra + 2 [\p{Nd}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[\p{Nd}+-]+/utf +Memory allocation (code space): 84 +------------------------------------------------------------------ + 0 18 Bra + 2 [+\-\p{Nd}]++ + 18 18 Ket + 20 End +------------------------------------------------------------------ + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/i,utf +Memory allocation (code space): 60 +------------------------------------------------------------------ + 0 12 Bra + 2 /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} + 12 12 Ket + 14 End +------------------------------------------------------------------ + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/utf +Memory allocation (code space): 60 
+------------------------------------------------------------------ + 0 12 Bra + 2 A\x{391}\x{10427}\x{ff3a}\x{1fb0} + 12 12 Ket + 14 End +------------------------------------------------------------------ + +/[\x{105}-\x{109}]/i,utf +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 9 Bra + 2 [\x{104}-\x{109}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/( ( (?(1)0|) )* )/x +Memory allocation (code space): 104 +------------------------------------------------------------------ + 0 23 Bra + 2 19 CBra 1 + 5 Brazero + 6 13 SCBra 2 + 9 6 Cond + 11 1 Cond ref + 13 0 + 15 2 Alt + 17 8 Ket + 19 13 KetRmax + 21 19 Ket + 23 23 Ket + 25 End +------------------------------------------------------------------ + +/( (?(1)0|)* )/x +Memory allocation (code space): 84 +------------------------------------------------------------------ + 0 18 Bra + 2 14 CBra 1 + 5 Brazero + 6 6 SCond + 8 1 Cond ref + 10 0 + 12 2 Alt + 14 8 KetRmax + 16 14 Ket + 18 18 Ket + 20 End +------------------------------------------------------------------ + +/[a]/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 a + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[a]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 a + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[\xaa]/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{aa} + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[\xaa]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 \x{aa} + 4 4 Ket + 6 End 
+------------------------------------------------------------------ + +/[^a]/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 [^a] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[^a]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 [^a] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[^\xaa]/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 [^\x{aa}] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +/[^\xaa]/utf +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 4 Bra + 2 [^\x{aa}] + 4 4 Ket + 6 End +------------------------------------------------------------------ + +#pattern -memory + +/[^\d]/utf,ucp +------------------------------------------------------------------ + 0 9 Bra + 2 [^\p{Nd}] + 9 9 Ket + 11 End +------------------------------------------------------------------ + +/[[:^alpha:][:^cntrl:]]+/utf,ucp +------------------------------------------------------------------ + 0 18 Bra + 2 [ -~\x80-\xff\P{L}]++ + 18 18 Ket + 20 End +------------------------------------------------------------------ + +/[[:^cntrl:][:^alpha:]]+/utf,ucp +------------------------------------------------------------------ + 0 18 Bra + 2 [ -~\x80-\xff\P{L}]++ + 18 18 Ket + 20 End +------------------------------------------------------------------ + +/[[:alpha:]]+/utf,ucp +------------------------------------------------------------------ + 0 10 Bra + 2 [\p{L}]++ + 10 10 Ket + 12 End +------------------------------------------------------------------ + +/[[:^alpha:]\S]+/utf,ucp +------------------------------------------------------------------ + 0 13 Bra + 2 [\P{L}\P{Xsp}]++ + 13 13 Ket + 15 End 
+------------------------------------------------------------------ + +/abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ +------------------------------------------------------------------ + 0 60 Bra + 2 abc + 8 5 CBra 1 + 11 d + 13 4 Alt + 15 e + 17 9 Ket + 19 *THEN + 20 x + 22 12 CBra 2 + 25 123 + 31 *THEN + 32 4 + 34 24 Alt + 36 567 + 42 5 CBra 3 + 45 b + 47 4 Alt + 49 q + 51 9 Ket + 53 *THEN + 54 xx + 58 36 Ket + 60 60 Ket + 62 End +------------------------------------------------------------------ + +/(((a\2)|(a*)\g<-1>))*a?/ +------------------------------------------------------------------ + 0 39 Bra + 2 Brazero + 3 32 SCBra 1 + 6 27 Once + 8 12 CBra 2 + 11 7 CBra 3 + 14 a + 16 \2 + 18 7 Ket + 20 11 Alt + 22 5 CBra 4 + 25 a* + 27 5 Ket + 29 22 Recurse + 31 23 Ket + 33 27 Ket + 35 32 KetRmax + 37 a?+ + 39 39 Ket + 41 End +------------------------------------------------------------------ + +# End of testinput8 diff --git a/testdata/testoutput8-8 b/testdata/testoutput8-8 new file mode 100644 index 0000000..ae0518e --- /dev/null +++ b/testdata/testoutput8-8 @@ -0,0 +1,745 @@ +# These are a few representative patterns whose lengths and offsets are to be +# shown when the link size is 2. This is just a doublecheck test to ensure the +# sizes don't go horribly wrong when something is changed. The pattern contents +# are all themselves checked in other tests. Unicode, including property +# support, is required for these tests. 
+ +#pattern fullbincode,memory + +/((?i)b)/ +Memory allocation (code space): 17 +------------------------------------------------------------------ + 0 13 Bra + 3 7 CBra 1 + 8 /i b + 10 7 Ket + 13 13 Ket + 16 End +------------------------------------------------------------------ + +/(?s)(.*X|^B)/ +Memory allocation (code space): 25 +------------------------------------------------------------------ + 0 21 Bra + 3 9 CBra 1 + 8 AllAny* + 10 X + 12 6 Alt + 15 ^ + 16 B + 18 15 Ket + 21 21 Ket + 24 End +------------------------------------------------------------------ + +/(?s:.*X|^B)/ +Memory allocation (code space): 23 +------------------------------------------------------------------ + 0 19 Bra + 3 7 Bra + 6 AllAny* + 8 X + 10 6 Alt + 13 ^ + 14 B + 16 13 Ket + 19 19 Ket + 22 End +------------------------------------------------------------------ + +/^[[:alnum:]]/ +Memory allocation (code space): 41 +------------------------------------------------------------------ + 0 37 Bra + 3 ^ + 4 [0-9A-Za-z] + 37 37 Ket + 40 End +------------------------------------------------------------------ + +/#/Ix +Memory allocation (code space): 7 +------------------------------------------------------------------ + 0 3 Bra + 3 3 Ket + 6 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +May match empty string +Options: extended +No first code unit +No last code unit +Subject length lower bound = 0 + +/a#/Ix +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 a + 5 5 Ket + 8 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: extended +First code unit = 'a' +No last code unit +Subject length lower bound = 1 + +/x?+/ +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 x?+ + 5 5 Ket + 8 End 
+------------------------------------------------------------------ + +/x++/ +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 x++ + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/x{1,3}+/ +Memory allocation (code space): 13 +------------------------------------------------------------------ + 0 9 Bra + 3 x + 5 x{0,2}+ + 9 9 Ket + 12 End +------------------------------------------------------------------ + +/(x)*+/ +Memory allocation (code space): 18 +------------------------------------------------------------------ + 0 14 Bra + 3 Braposzero + 4 7 CBraPos 1 + 9 x + 11 7 KetRpos + 14 14 Ket + 17 End +------------------------------------------------------------------ + +/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/ +Memory allocation (code space): 120 +------------------------------------------------------------------ + 0 116 Bra + 3 ^ + 4 109 CBra 1 + 9 7 CBra 2 + 14 a+ + 16 7 Ket + 19 39 CBra 3 + 24 [ab]+? 
+ 58 39 Ket + 61 39 CBra 4 + 66 [bc]+ +100 39 Ket +103 7 CBra 5 +108 \w*+ +110 7 Ket +113 109 Ket +116 116 Ket +119 End +------------------------------------------------------------------ + +"8J\$WE\<\.rX\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" +Memory allocation (code space): 826 +------------------------------------------------------------------ + 0 822 Bra + 3 8J$WE<.rX+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDDqmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X +821 \b +822 822 Ket +825 End +------------------------------------------------------------------ + +"\$\<\.X\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b" +Memory allocation (code space): 816 +------------------------------------------------------------------ + 0 812 Bra + 3 
$<.X+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDDqmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X +811 \b +812 812 Ket +815 End +------------------------------------------------------------------ + +/(a(?1)b)/ +Memory allocation (code space): 22 +------------------------------------------------------------------ + 0 18 Bra + 3 12 CBra 1 + 8 a + 10 3 Recurse + 13 b + 15 12 Ket + 18 18 Ket + 21 End +------------------------------------------------------------------ + +/(a(?1)+b)/ +Memory allocation (code space): 28 +------------------------------------------------------------------ + 0 24 Bra + 3 18 CBra 1 + 8 a + 10 6 Once + 13 3 Recurse + 16 6 KetRmax + 19 b + 21 18 Ket + 24 24 Ket + 27 End +------------------------------------------------------------------ + +/a(?Pb|c)d(?Pe)/ +Memory allocation (code space): 36 +------------------------------------------------------------------ + 0 32 Bra + 3 a + 5 7 CBra 1 + 10 b + 12 5 Alt + 15 c + 17 12 Ket + 20 d + 22 7 CBra 2 + 27 e + 29 7 Ket + 32 32 Ket + 35 End +------------------------------------------------------------------ + +/(?:a(?Pc(?Pd)))(?Pa)/ +Memory allocation (code space): 45 +------------------------------------------------------------------ + 0 41 Bra + 3 25 Bra + 6 a + 8 17 CBra 1 + 13 c + 15 7 CBra 2 + 20 d + 22 7 Ket + 25 17 Ket + 28 25 Ket + 31 7 CBra 3 + 36 a + 38 7 Ket + 41 41 Ket + 44 End +------------------------------------------------------------------ + +/(?Pa)...(?P=a)bbb(?P>a)d/ +Memory allocation (code space): 34 +------------------------------------------------------------------ + 0 30 Bra + 3 7 CBra 1 + 8 a + 10 7 Ket + 13 Any + 14 Any + 15 Any + 16 \1 + 19 bbb + 25 3 Recurse + 28 d + 30 30 Ket + 33 End 
+------------------------------------------------------------------ + +/abc(?C255)de(?C)f/ +Memory allocation (code space): 31 +------------------------------------------------------------------ + 0 27 Bra + 3 abc + 9 Callout 255 10 1 + 15 de + 19 Callout 0 16 1 + 25 f + 27 27 Ket + 30 End +------------------------------------------------------------------ + +/abcde/auto_callout +Memory allocation (code space): 53 +------------------------------------------------------------------ + 0 49 Bra + 3 Callout 255 0 1 + 9 a + 11 Callout 255 1 1 + 17 b + 19 Callout 255 2 1 + 25 c + 27 Callout 255 3 1 + 33 d + 35 Callout 255 4 1 + 41 e + 43 Callout 255 5 0 + 49 49 Ket + 52 End +------------------------------------------------------------------ + +/\x{100}/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{100} + 6 6 Ket + 9 End +------------------------------------------------------------------ + +/\x{1000}/utf +Memory allocation (code space): 11 +------------------------------------------------------------------ + 0 7 Bra + 3 \x{1000} + 7 7 Ket + 10 End +------------------------------------------------------------------ + +/\x{10000}/utf +Memory allocation (code space): 12 +------------------------------------------------------------------ + 0 8 Bra + 3 \x{10000} + 8 8 Ket + 11 End +------------------------------------------------------------------ + +/\x{100000}/utf +Memory allocation (code space): 12 +------------------------------------------------------------------ + 0 8 Bra + 3 \x{100000} + 8 8 Ket + 11 End +------------------------------------------------------------------ + +/\x{10ffff}/utf +Memory allocation (code space): 12 +------------------------------------------------------------------ + 0 8 Bra + 3 \x{10ffff} + 8 8 Ket + 11 End +------------------------------------------------------------------ + +/\x{110000}/utf +Failed: error 134 at offset 9: character code point value in \x{} or 
\o{} is too large + +/[\x{ff}]/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{ff} + 6 6 Ket + 9 End +------------------------------------------------------------------ + +/[\x{100}]/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{100} + 6 6 Ket + 9 End +------------------------------------------------------------------ + +/\x80/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{80} + 6 6 Ket + 9 End +------------------------------------------------------------------ + +/\xff/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{ff} + 6 6 Ket + 9 End +------------------------------------------------------------------ + +/\x{0041}\x{2262}\x{0391}\x{002e}/I,utf +Memory allocation (code space): 18 +------------------------------------------------------------------ + 0 14 Bra + 3 A\x{2262}\x{391}. + 14 14 Ket + 17 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = 'A' +Last code unit = '.' 
+Subject length lower bound = 4 + +/\x{D55c}\x{ad6d}\x{C5B4}/I,utf +Memory allocation (code space): 19 +------------------------------------------------------------------ + 0 15 Bra + 3 \x{d55c}\x{ad6d}\x{c5b4} + 15 15 Ket + 18 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xed +Last code unit = \xb4 +Subject length lower bound = 3 + +/\x{65e5}\x{672c}\x{8a9e}/I,utf +Memory allocation (code space): 19 +------------------------------------------------------------------ + 0 15 Bra + 3 \x{65e5}\x{672c}\x{8a9e} + 15 15 Ket + 18 End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First code unit = \xe6 +Last code unit = \x9e +Subject length lower bound = 3 + +/[\x{100}]/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{100} + 6 6 Ket + 9 End +------------------------------------------------------------------ + +/[Z\x{100}]/utf +Memory allocation (code space): 47 +------------------------------------------------------------------ + 0 43 Bra + 3 [Z\x{100}] + 43 43 Ket + 46 End +------------------------------------------------------------------ + +/^[\x{100}\E-\Q\E\x{150}]/utf +Memory allocation (code space): 18 +------------------------------------------------------------------ + 0 14 Bra + 3 ^ + 4 [\x{100}-\x{150}] + 14 14 Ket + 17 End +------------------------------------------------------------------ + +/^[\QĀ\E-\QŐ\E]/utf +Memory allocation (code space): 18 +------------------------------------------------------------------ + 0 14 Bra + 3 ^ + 4 [\x{100}-\x{150}] + 14 14 Ket + 17 End +------------------------------------------------------------------ + +/^[\QĀ\E-\QŐ\E/utf +Failed: error 106 at offset 15: missing terminating ] for character class + +/[\p{L}]/ +Memory allocation (code space): 15 
+------------------------------------------------------------------ + 0 11 Bra + 3 [\p{L}] + 11 11 Ket + 14 End +------------------------------------------------------------------ + +/[\p{^L}]/ +Memory allocation (code space): 15 +------------------------------------------------------------------ + 0 11 Bra + 3 [\P{L}] + 11 11 Ket + 14 End +------------------------------------------------------------------ + +/[\P{L}]/ +Memory allocation (code space): 15 +------------------------------------------------------------------ + 0 11 Bra + 3 [\P{L}] + 11 11 Ket + 14 End +------------------------------------------------------------------ + +/[\P{^L}]/ +Memory allocation (code space): 15 +------------------------------------------------------------------ + 0 11 Bra + 3 [\p{L}] + 11 11 Ket + 14 End +------------------------------------------------------------------ + +/[abc\p{L}\x{0660}]/utf +Memory allocation (code space): 50 +------------------------------------------------------------------ + 0 46 Bra + 3 [a-c\p{L}\x{660}] + 46 46 Ket + 49 End +------------------------------------------------------------------ + +/[\p{Nd}]/utf +Memory allocation (code space): 15 +------------------------------------------------------------------ + 0 11 Bra + 3 [\p{Nd}] + 11 11 Ket + 14 End +------------------------------------------------------------------ + +/[\p{Nd}+-]+/utf +Memory allocation (code space): 48 +------------------------------------------------------------------ + 0 44 Bra + 3 [+\-\p{Nd}]++ + 44 44 Ket + 47 End +------------------------------------------------------------------ + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/i,utf +Memory allocation (code space): 25 +------------------------------------------------------------------ + 0 21 Bra + 3 /i A\x{391}\x{10427}\x{ff3a}\x{1fb0} + 21 21 Ket + 24 End +------------------------------------------------------------------ + +/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/utf +Memory allocation (code space): 25 
+------------------------------------------------------------------ + 0 21 Bra + 3 A\x{391}\x{10427}\x{ff3a}\x{1fb0} + 21 21 Ket + 24 End +------------------------------------------------------------------ + +/[\x{105}-\x{109}]/i,utf +Memory allocation (code space): 17 +------------------------------------------------------------------ + 0 13 Bra + 3 [\x{104}-\x{109}] + 13 13 Ket + 16 End +------------------------------------------------------------------ + +/( ( (?(1)0|) )* )/x +Memory allocation (code space): 38 +------------------------------------------------------------------ + 0 34 Bra + 3 28 CBra 1 + 8 Brazero + 9 19 SCBra 2 + 14 8 Cond + 17 1 Cond ref + 20 0 + 22 3 Alt + 25 11 Ket + 28 19 KetRmax + 31 28 Ket + 34 34 Ket + 37 End +------------------------------------------------------------------ + +/( (?(1)0|)* )/x +Memory allocation (code space): 30 +------------------------------------------------------------------ + 0 26 Bra + 3 20 CBra 1 + 8 Brazero + 9 8 SCond + 12 1 Cond ref + 15 0 + 17 3 Alt + 20 11 KetRmax + 23 20 Ket + 26 26 Ket + 29 End +------------------------------------------------------------------ + +/[a]/ +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 a + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/[a]/utf +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 a + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/[\xaa]/ +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 \x{aa} + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/[\xaa]/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 \x{aa} + 6 6 Ket + 9 End 
+------------------------------------------------------------------ + +/[^a]/ +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 [^a] + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/[^a]/utf +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 [^a] + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/[^\xaa]/ +Memory allocation (code space): 9 +------------------------------------------------------------------ + 0 5 Bra + 3 [^\x{aa}] + 5 5 Ket + 8 End +------------------------------------------------------------------ + +/[^\xaa]/utf +Memory allocation (code space): 10 +------------------------------------------------------------------ + 0 6 Bra + 3 [^\x{aa}] + 6 6 Ket + 9 End +------------------------------------------------------------------ + +#pattern -memory + +/[^\d]/utf,ucp +------------------------------------------------------------------ + 0 11 Bra + 3 [^\p{Nd}] + 11 11 Ket + 14 End +------------------------------------------------------------------ + +/[[:^alpha:][:^cntrl:]]+/utf,ucp +------------------------------------------------------------------ + 0 44 Bra + 3 [ -~\x80-\xff\P{L}]++ + 44 44 Ket + 47 End +------------------------------------------------------------------ + +/[[:^cntrl:][:^alpha:]]+/utf,ucp +------------------------------------------------------------------ + 0 44 Bra + 3 [ -~\x80-\xff\P{L}]++ + 44 44 Ket + 47 End +------------------------------------------------------------------ + +/[[:alpha:]]+/utf,ucp +------------------------------------------------------------------ + 0 12 Bra + 3 [\p{L}]++ + 12 12 Ket + 15 End +------------------------------------------------------------------ + +/[[:^alpha:]\S]+/utf,ucp +------------------------------------------------------------------ + 0 15 Bra + 3 [\P{L}\P{Xsp}]++ + 15 15 Ket + 18 End 
+------------------------------------------------------------------ + +/abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ +------------------------------------------------------------------ + 0 73 Bra + 3 abc + 9 7 CBra 1 + 14 d + 16 5 Alt + 19 e + 21 12 Ket + 24 *THEN + 25 x + 27 14 CBra 2 + 32 123 + 38 *THEN + 39 4 + 41 29 Alt + 44 567 + 50 7 CBra 3 + 55 b + 57 5 Alt + 60 q + 62 12 Ket + 65 *THEN + 66 xx + 70 43 Ket + 73 73 Ket + 76 End +------------------------------------------------------------------ + +/(((a\2)|(a*)\g<-1>))*a?/ +------------------------------------------------------------------ + 0 57 Bra + 3 Brazero + 4 48 SCBra 1 + 9 40 Once + 12 18 CBra 2 + 17 10 CBra 3 + 22 a + 24 \2 + 27 10 Ket + 30 16 Alt + 33 7 CBra 4 + 38 a* + 40 7 Ket + 43 33 Recurse + 46 34 Ket + 49 40 Ket + 52 48 KetRmax + 55 a?+ + 57 57 Ket + 60 End +------------------------------------------------------------------ + +# End of testinput8 diff --git a/testdata/testoutput9 b/testdata/testoutput9 new file mode 100644 index 0000000..5d1460a --- /dev/null +++ b/testdata/testoutput9 @@ -0,0 +1,498 @@ +# This set of tests is run only with the 8-bit library. They do not require +# UTF-8 or Unicode property support. The file starts with all the tests of +# the POSIX interface, because that is supported only with the 8-bit library. 
+ +#forbid_utf +#pattern posix + +/abc/ + abc + 0: abc + *** Failers +No match: POSIX code 17: match failed + +/^abc|def/ + abcdef + 0: abc + abcdef\=notbol + 0: def + +/.*((abc)$|(def))/ + defabc + 0: defabc + 1: abc + 2: abc + defabc\=noteol + 0: def + 1: def + 3: def + +/the quick brown fox/ + the quick brown fox + 0: the quick brown fox + *** Failers +No match: POSIX code 17: match failed + The Quick Brown Fox +No match: POSIX code 17: match failed + +/the quick brown fox/i + the quick brown fox + 0: the quick brown fox + The Quick Brown Fox + 0: The Quick Brown Fox + +/abc.def/ + *** Failers +No match: POSIX code 17: match failed + abc\ndef +No match: POSIX code 17: match failed + +/abc$/ + abc + 0: abc + abc\n + 0: abc + +/(abc)\2/ +Failed: POSIX code 15: bad back reference at offset 7 + +/(abc\1)/ + abc +No match: POSIX code 17: match failed + +/a*(b+)(z)(z)/ + aaaabbbbzzzz + 0: aaaabbbbzz + 1: bbbb + 2: z + 3: z + aaaabbbbzzzz\=ovector=0 +Matched without capture + aaaabbbbzzzz\=ovector=1 + 0: aaaabbbbzz + aaaabbbbzzzz\=ovector=2 + 0: aaaabbbbzz + 1: bbbb + +/ab.cd/ + ab-cd + 0: ab-cd + ab=cd + 0: ab=cd + ** Failers +No match: POSIX code 17: match failed + ab\ncd +No match: POSIX code 17: match failed + +/ab.cd/s + ab-cd + 0: ab-cd + ab=cd + 0: ab=cd + ab\ncd + 0: ab\x0acd + +/a(b)c/no_auto_capture + abc +Matched with REG_NOSUB + +/a(?Pb)c/no_auto_capture + abc +Matched with REG_NOSUB + +/a?|b?/ + abc + 0: a + ** Failers + 0: + ddd\=notempty +No match: POSIX code 17: match failed + +/\w+A/ + CDAAAAB + 0: CDAAAA + +/\w+A/ungreedy + CDAAAAB + 0: CDA + +/\Biss\B/I,aftertext +** Ignored with POSIX interface: info + Mississippi + 0: iss + 0+ issippi + +/abc/\ +Failed: POSIX code 9: bad escape sequence at offset 4 + +#pattern -posix + +# End of POSIX tests + +/a\Cb/ + aXb + 0: aXb + a\nb + 0: a\x0ab + ** Failers (too big char) +No match + A\x{123}B +** Character \x{123} is greater than 255 and UTF-8 mode is not enabled. 
+** Truncation will probably give the wrong result. +No match + A\o{443}B +** Character \x{123} is greater than 255 and UTF-8 mode is not enabled. +** Truncation will probably give the wrong result. +No match + +/\x{100}/I +Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large + +/\o{400}/I +Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large + +/ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional leading comment +(?: (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address +| # or +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # one word, optionally followed by.... +(?: +[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or... +\( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) | # comments, or... + +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +# quoted strings +)* +< (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # leading < +(?: @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* + +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* , (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... 
+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +)* # further okay, if led by comma +: # closing colon +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* )? # optional route +(?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... +[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) # initial word +(?: (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| +" (?: # opening quote... 
+[^\\\x80-\xff\n\015"] # Anything except backslash and quote +| # or +\\ [^\x80-\xff] # Escaped something (something != CR) +)* " # closing quote +) )* # further okay, if led by a period +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* @ (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # initial subdomain +(?: # +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* \. # if led by a period... +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* (?: +[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters... +(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom +| \[ # [ +(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff +\] # ] +) # ...further okay +)* +# address spec +(?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* > # trailing > +# name and address +) (?: [\040\t] | \( +(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )* +\) )* # optional trailing comment +/Ix +Capturing subpattern count = 0 +Contains explicit CR or LF match +Options: extended +Starting code units: \x09 \x20 ! " # $ % & ' ( * + - / 0 1 2 3 4 5 6 7 8 + 9 = ? 
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ^ _ ` a b c d e + f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f +No last code unit +Subject length lower bound = 3 + +/\h/I +Capturing subpattern count = 0 +No options +Starting code units: \x09 \x20 \xa0 +No last code unit +Subject length lower bound = 1 + +/\H/I +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/\v/I +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 +No last code unit +Subject length lower bound = 1 + +/\V/I +Capturing subpattern count = 0 +No options +No first code unit +No last code unit +Subject length lower bound = 1 + +/\R/I +Capturing subpattern count = 0 +No options +Starting code units: \x0a \x0b \x0c \x0d \x85 +No last code unit +Subject length lower bound = 1 + +/[\h]/B +------------------------------------------------------------------ + Bra + [\x09 \xa0] + Ket + End +------------------------------------------------------------------ + >\x09< + 0: \x09 + +/[\h]+/B +------------------------------------------------------------------ + Bra + [\x09 \xa0]++ + Ket + End +------------------------------------------------------------------ + >\x09\x20\xa0< + 0: \x09 \xa0 + +/[\v]/B +------------------------------------------------------------------ + Bra + [\x0a-\x0d\x85] + Ket + End +------------------------------------------------------------------ + +/[\H]/B +------------------------------------------------------------------ + Bra + [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff] + Ket + End +------------------------------------------------------------------ + +/[^\h]/B +------------------------------------------------------------------ + Bra + [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff] (neg) + Ket + End +------------------------------------------------------------------ + +/[\V]/B +------------------------------------------------------------------ + Bra + [\x00-\x09\x0e-\x84\x86-\xff] + Ket + 
End +------------------------------------------------------------------ + +/[\x0a\V]/B +------------------------------------------------------------------ + Bra + [\x00-\x0a\x0e-\x84\x86-\xff] + Ket + End +------------------------------------------------------------------ + +/\777/I +Failed: error 151 at offset 3: octal value is greater than \377 in 8-bit non-UTF-8 mode + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF)XX/mark +Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN) + XX + +/(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE)XX/mark + XX + 0: XX +MK: 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDE + +/\u0100/alt_bsux,allow_empty_class,match_unset_backref,dupnames +Failed: error 177 at offset 5: character code point value in \u.... sequence is too large + +/[\u0100-\u0200]/alt_bsux,allow_empty_class,match_unset_backref,dupnames +Failed: error 177 at offset 6: character code point value in \u.... 
sequence is too large + +/[^\x00-a]{12,}[^b-\xff]*/B +------------------------------------------------------------------ + Bra + [b-\xff] (neg){12,}+ + [\x00-a] (neg)*+ + Ket + End +------------------------------------------------------------------ + +/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B +------------------------------------------------------------------ + Bra + [\x00-\x08\x0e-\x1f!-\xff] (neg)*+ + \s* + + [0-9A-Z_a-z]++ + \W+ + + [\x00-/:-\xff] (neg)*+ + \d + 0 + [\x00-/:-@[-^`{-\xff] (neg){4,6}+ + \w* + A + Ket + End +------------------------------------------------------------------ + +# End of testinput9