From ee41aa906f9473a5e25e167fab0fa87526ea4c79 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 18 Aug 2015 10:34:05 +0000 Subject: [PATCH] Runtime UTF checks now take note of the starting offset. --- ChangeLog | 4 + RunTest | 88 +++--- doc/pcre2api.3 | 19 +- doc/pcre2unicode.3 | 26 +- src/pcre2_compile.c | 2 +- src/pcre2_dfa_match.c | 50 +++- src/pcre2_intmodedep.h | 18 +- src/pcre2_match.c | 47 +++- testdata/testinput10 | 29 ++ testdata/testinput12 | 4 + testdata/testinput14 | 182 +++---------- testdata/testinput15 | 156 ++++++++++- testdata/testinput16 | 251 +---------------- testdata/testinput17 | 289 +++++++++++++++----- testdata/testinput18 | 100 ++++++- testdata/testinput19 | 75 +---- testdata/testinput20 | 62 +++++ testdata/testoutput10 | 48 ++++ testdata/testoutput12-16 | 6 + testdata/testoutput12-32 | 6 + testdata/testoutput14 | 334 ----------------------- testdata/testoutput14-16 | 61 +++++ testdata/testoutput14-32 | 61 +++++ testdata/testoutput14-8 | 61 +++++ testdata/testoutput15 | 329 +++++++++++++++++++++- testdata/testoutput16 | 487 +-------------------------------- testdata/testoutput17 | 576 +++++++++++++++++++++++++++++++-------- testdata/testoutput18 | 154 ++++++++++- testdata/testoutput19 | 116 ++------ testdata/testoutput20 | 100 +++++++ 30 files changed, 2077 insertions(+), 1664 deletions(-) create mode 100644 testdata/testinput20 delete mode 100644 testdata/testoutput14 create mode 100644 testdata/testoutput14-16 create mode 100644 testdata/testoutput14-32 create mode 100644 testdata/testoutput14-8 create mode 100644 testdata/testoutput20 diff --git a/ChangeLog b/ChangeLog index 7cd2eba..e8f5daa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -145,6 +145,10 @@ was fixed. 39. Match limit check added to recursion. This issue was found by Karl Skomski with a custom LLVM fuzzer. +40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look +only at the part of the subject that is relevant when the starting offset is +non-zero. 
+ Version 10.20 30-June-2015 -------------------------- diff --git a/RunTest b/RunTest index fb758fe..a0345c8 100755 --- a/RunTest +++ b/RunTest @@ -68,12 +68,13 @@ title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support" title11="Test 11: Specials for the basic 16-bit and 32-bit libraries" title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support" title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries" -title14="Test 14: Non-JIT limits and other non-JIT tests" -title15="Test 15: JIT-specific features when JIT is not available" -title16="Test 16: JIT-specific features when JIT is available" -title17="Test 17: Tests of the POSIX interface, excluding UTF/UCP" -title18="Test 18: Tests of the POSIX interface with UTF/UCP" -title19="Test 19: Serialization tests" +title14="Test 14: DFA specials for UTF and UCP support" +title15="Test 15: Non-JIT limits and other non-JIT tests" +title16="Test 16: JIT-specific features when JIT is not available" +title17="Test 17: JIT-specific features when JIT is available" +title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP" +title19="Test 19: Tests of the POSIX interface with UTF/UCP" +title20="Test 20: Serialization tests" maxtest=18 if [ $# -eq 1 -a "$1" = "list" ]; then @@ -97,6 +98,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then echo $title17 echo $title18 echo $title19 + echo $title20 exit 0 fi @@ -219,6 +221,7 @@ do16=no do17=no do18=no do19=no +do20=no while [ $# -gt 0 ] ; do case $1 in @@ -242,10 +245,11 @@ while [ $# -gt 0 ] ; do 17) do17=yes;; 18) do18=yes;; 19) do19=yes;; + 20) do20=yes;; -8) arg8=yes;; -16) arg16=yes;; -32) arg32=yes;; - bigstack|-bigstack) bigstack=yes;; + bigstack|-bigstack) bigstack=yes;; nojit|-nojit) nojit=yes;; sim|-sim) shift; sim=$1;; valgrind|-valgrind) valgrind="valgrind --tool=memcheck -q --smc-check=all";; @@ -305,10 +309,10 @@ if [ $? 
-eq 0 ] ; then else test2stack="-S 1024" defaultstack="-S 64" - fi + fi else test2stack="" - defaultstack="" + defaultstack="" fi # All of 8-bit, 16-bit, and 32-bit character strings may be supported, but only @@ -387,7 +391,8 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \ $do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \ $do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \ $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \ - $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no \ + $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \ + $do20 = no \ ]; then do0=yes do1=yes @@ -409,6 +414,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \ do17=yes do18=yes do19=yes + do20=yes fi # Handle any explicit skips at this stage, so that an argument list may consist @@ -688,71 +694,79 @@ for bmode in "$test8" "$test16" "$test32"; do checkresult $? 13 "" fi fi - - # Test non-JIT match and recursion limits + + # Tests for DFA UTF and UCP features. Output is different for the different widths. if [ $do14 = yes ] ; then echo $title14 - $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput14 testtry - checkresult $? 14 "" + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry + checkresult $? 14-$bits "$opt" + fi + + # Test non-JIT match and recursion limits + + if [ $do15 = yes ] ; then + echo $title15 + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry + checkresult $? 15 "" fi # Test JIT-specific features when JIT is not available - if [ $do15 = yes ] ; then - echo $title15 - if [ $jit -ne 0 ] ; then - echo " Skipped because JIT is available" - else - $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry - checkresult $? 
15 "" - fi - fi - - # Test JIT-specific features when JIT is available - if [ $do16 = yes ] ; then echo $title16 - if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then - echo " Skipped because JIT is not available or nojit was specified" + if [ $jit -ne 0 ] ; then + echo " Skipped because JIT is available" else $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput16 testtry checkresult $? 16 "" fi fi - # Tests for the POSIX interface without UTF/UCP (8-bit only) + # Test JIT-specific features when JIT is available if [ $do17 = yes ] ; then echo $title17 - if [ "$bits" = "16" -o "$bits" = "32" ] ; then - echo " Skipped when running 16/32-bit tests" + if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then + echo " Skipped because JIT is not available or nojit was specified" else $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput17 testtry checkresult $? 17 "" fi fi - # Tests for the POSIX interface with UTF/UCP (8-bit only) + # Tests for the POSIX interface without UTF/UCP (8-bit only) if [ $do18 = yes ] ; then echo $title18 if [ "$bits" = "16" -o "$bits" = "32" ] ; then echo " Skipped when running 16/32-bit tests" - elif [ $utf -eq 0 ] ; then - echo " Skipped because UTF-$bits support is not available" else $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput18 testtry checkresult $? 18 "" fi fi - # Serialization tests + # Tests for the POSIX interface with UTF/UCP (8-bit only) if [ $do19 = yes ] ; then echo $title19 - $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry - checkresult $? 19 "" + if [ "$bits" = "16" -o "$bits" = "32" ] ; then + echo " Skipped when running 16/32-bit tests" + elif [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry + checkresult $? 
19 "" + fi + fi + + # Serialization tests + + if [ $do20 = yes ] ; then + echo $title20 + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry + checkresult $? 20 "" fi # End of loop for 8/16/32-bit tests diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index d8ae9d6..7cd20d3 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "29 July 2015" "PCRE2 10.21" +.TH PCRE2API 3 "18 August 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -2022,12 +2022,19 @@ If the pattern is anchored, such a match can occur only if the pattern contains .sp When PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked by default when \fBpcre2_match()\fP is subsequently called. -The entire string is checked before any other processing takes place, and a +If a non-zero starting offset is given, the check is applied only to that part +of the subject that could be inspected during matching, and there is a check +that the starting offset points to the first code unit of a character or to the +end of the subject. If there are no lookbehind assertions in the pattern, the +check starts at the starting offset. Otherwise, it starts at the length of the +longest lookbehind before the starting offset, or at the start of the subject +if there are not that many characters before the starting offset. Note that the +sequences \eb and \eB are one-character lookbehinds. +.P +The check is carried out before any other processing takes place, and a negative error code is returned if the check fails. There are several UTF error codes for each code unit width, corresponding to different problems with the -code unit sequence. The value of \fIstartoffset\fP is also checked, to ensure -that it points to the start of a character or to the end of the subject. There -are discussions about the validity of +code unit sequence. 
There are discussions about the validity of .\" HTML .\" UTF-8 strings, @@ -2939,6 +2946,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 29 July 2015 +Last updated: 18 August 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2unicode.3 b/doc/pcre2unicode.3 index 6c32bc0..cd98ce8 100644 --- a/doc/pcre2unicode.3 +++ b/doc/pcre2unicode.3 @@ -1,4 +1,4 @@ -.TH PCRE2UNICODE 3 "23 November 2014" "PCRE2 10.00" +.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21" .SH NAME PCRE - Perl-compatible regular expressions (revised API) .SH "UNICODE AND UTF SUPPORT" @@ -117,11 +117,21 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code knows as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. .P -The entire string is checked before any other processing takes place. In -addition to checking the format of the string, there is a check to ensure that -all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. -The so-called "non-character" code points are not excluded because Unicode -corrigendum #9 makes it clear that they should not be. +A UTF string is checked before any other processing takes place. In the case of +\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting +offset, the check is applied only to that part of the subject that could be +inspected during matching, and there is a check that the starting offset points +to the first code unit of a character or to the end of the subject. If there +are no lookbehind assertions in the pattern, the check starts at the starting +offset. Otherwise, it starts at the length of the longest lookbehind before the +starting offset, or at the start of the subject if there are not that many +characters before the starting offset. Note that the sequences \eb and \eB are +one-character lookbehinds. 
+.P +In addition to checking the format of the string, there is a check to ensure +that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate +area. The so-called "non-character" code points are not excluded because +Unicode corrigendum #9 makes it clear that they should not be. .P Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, where they are used in pairs to encode code points with values greater than @@ -252,6 +262,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 November 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 18 August 2015 +Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index aa65fd3..4fe0236 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -4682,7 +4682,7 @@ for (;; ptr++) that it's a length rather than a small character. */ #ifdef MAYBE_UTF_MULTI - if (utf && NOT_FIRSTCHAR(code[-1])) + if (utf && NOT_FIRSTCU(code[-1])) { PCRE2_UCHAR *lastchar = code - 1; BACKCHAR(lastchar); diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index ff4d332..3cfa454 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -2774,7 +2774,7 @@ for (;;) { PCRE2_SPTR p = start_subject + local_offsets[rc]; PCRE2_SPTR pp = start_subject + local_offsets[rc+1]; - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; } #endif if (charcount > 0) @@ -2874,7 +2874,7 @@ for (;;) PCRE2_SPTR pp = local_ptr; charcount = (int)(pp - p); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; #endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); } @@ -2960,7 +2960,7 @@ for (;;) { PCRE2_SPTR p = start_subject + local_offsets[0]; PCRE2_SPTR pp = start_subject + local_offsets[1]; - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + while 
(p < pp) if (NOT_FIRSTCU(*p++)) charcount--; } #endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); @@ -3264,18 +3264,50 @@ switch(re->newline_convention) /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, we must also check that a starting offset does not point into the middle of a -multiunit character. */ +multiunit character. We check only the portion of the subject that is going to +be inspected during matching - from the offset minus the maximum lookbehind +to the given length. This saves time when a small part of a large subject is +being matched by the use of a starting offset. Note that the maximum lookbehind +is a number of characters, not code units. */ #ifdef SUPPORT_UNICODE if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { - match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar)); - if (match_data->rc != 0) return match_data->rc; + PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ + + if (start_offset > 0) + { #if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_offset > 0 && start_offset < length && - NOT_FIRSTCHAR(subject[start_offset])) - return PCRE2_ERROR_BADUTFOFFSET; + unsigned int i; + if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + return PCRE2_ERROR_BADUTFOFFSET; + for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) + { + check_subject--; + while (check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*check_subject & 0xfc00) == 0xdc00) +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + check_subject--; + } +#else /* In the 32-bit library, one code unit equals one character. */ + check_subject -= re->max_lookbehind; + if (check_subject < subject) check_subject = subject; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + } + + /* Validate the relevant portion of the subject. After an error, adjust the + offset to be an absolute offset in the whole string. 
*/ + + match_data->rc = PRIV(valid_utf)(check_subject, + length - (check_subject - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += check_subject - subject; + return match_data->rc; + } } #endif /* SUPPORT_UNICODE */ diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 704c375..85ceb06 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -72,7 +72,7 @@ just to undefine them all. */ #undef MAX_MARK #undef MAX_PATTERN_SIZE #undef MAX_UTF_SINGLE_CU -#undef NOT_FIRSTCHAR +#undef NOT_FIRSTCU #undef PUT #undef PUT2 #undef PUT2INC @@ -252,7 +252,7 @@ UTF support is omitted, we don't even define them. */ /* #define MAX_UTF_SINGLE_CU */ /* #define HAS_EXTRALEN(c) */ /* #define GET_EXTRALEN(c) */ -/* #define NOT_FIRSTCHAR(c) */ +/* #define NOT_FIRSTCU(c) */ #define GETCHAR(c, eptr) c = *eptr; #define GETCHARTEST(c, eptr) c = *eptr; #define GETCHARINC(c, eptr) c = *eptr++; @@ -285,10 +285,10 @@ Otherwise it has an undefined behaviour. */ #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f]) -/* Returns TRUE, if the given character is not the first character -of a UTF sequence. */ +/* Returns TRUE, if the given value is not the first code unit of a UTF +sequence. */ -#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80) +#define NOT_FIRSTCU(c) (((c) & 0xc0) == 0x80) /* Get the next UTF-8 character, not advancing the pointer. This is called when we know we are in UTF-8 mode. */ @@ -371,10 +371,10 @@ Otherwise it has an undefined behaviour. */ #define GET_EXTRALEN(c) 1 -/* Returns TRUE, if the given character is not the first character -of a UTF sequence. */ +/* Returns TRUE, if the given value is not the first code unit of a UTF +sequence. */ -#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00) +#define NOT_FIRSTCU(c) (((c) & 0xfc00) == 0xdc00) /* Base macro to pick up the low surrogate of a UTF-16 character, not advancing the pointer. */ @@ -469,7 +469,7 @@ into one PCRE2_UCHAR unit. 
*/ #define MAX_UTF_SINGLE_CU (0x10ffffu) #define HAS_EXTRALEN(c) (0) #define GET_EXTRALEN(c) (0) -#define NOT_FIRSTCHAR(c) (0) +#define NOT_FIRSTCU(c) (0) /* Get the next UTF-32 character, not advancing the pointer. This is called when we know we are in UTF-32 mode. */ diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 41113a5..8875d6d 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -6485,6 +6485,7 @@ mb->match_frames_base = &frame_zero; subject string. */ if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); +end_subject = subject + length; /* Plausibility checks */ @@ -6536,18 +6537,50 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, we must also check that a starting offset does not point into the middle of a -multiunit character. */ +multiunit character. We check only the portion of the subject that is going to +be inspected during matching - from the offset minus the maximum lookbehind +to the given length. This saves time when a small part of a large subject is +being matched by the use of a starting offset. Note that the maximum lookbehind +is a number of characters, not code units. 
*/ #ifdef SUPPORT_UNICODE if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { - match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar)); - if (match_data->rc != 0) return match_data->rc; + PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ + + if (start_offset > 0) + { #if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_offset > 0 && start_offset < length && - NOT_FIRSTCHAR(subject[start_offset])) - return PCRE2_ERROR_BADUTFOFFSET; + unsigned int i; + if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + return PCRE2_ERROR_BADUTFOFFSET; + for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) + { + check_subject--; + while (check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*check_subject & 0xfc00) == 0xdc00) +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + check_subject--; + } +#else /* In the 32-bit library, one code unit equals one character. */ + check_subject -= re->max_lookbehind; + if (check_subject < subject) check_subject = subject; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + } + + /* Validate the relevant portion of the subject. After an error, adjust the + offset to be an absolute offset in the whole string. 
*/ + + match_data->rc = PRIV(valid_utf)(check_subject, + length - (check_subject - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += check_subject - subject; + return match_data->rc; + } } #endif /* SUPPORT_UNICODE */ @@ -6594,7 +6627,7 @@ else mb->start_subject = subject; mb->start_offset = start_offset; -mb->end_subject = end_subject = mb->start_subject + length; +mb->end_subject = end_subject; mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; mb->moptions = options; /* Match options */ diff --git a/testdata/testinput10 b/testdata/testinput10 index a1fdd92..dc43629 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -132,7 +132,36 @@ \xf9\x87\x80\x80\x80\=no_utf_check \xfc\x84\x80\x80\x80\x80\=no_utf_check \xfd\x83\x80\x80\x80\x80\=no_utf_check + +# Similar tests with offsets +/badutf/utf + X\xdfabcd + X\xdfabcd\=offset=1 + X\xdfabcd\=offset=2 + +/(?<=x)badutf/utf + X\xdfabcd + X\xdfabcd\=offset=1 + X\xdfabcd\=offset=2 + X\xdfabcd\=offset=3 + X\xdfabcd\xdf\=offset=3 + +/(?<=xx)badutf/utf + X\xdfabcd + X\xdfabcd\=offset=1 + X\xdfabcd\=offset=2 + X\xdfabcd\=offset=3 + +/(?<=xxxx)badutf/utf + X\xdfabcd + X\xdfabcd\=offset=1 + X\xdfabcd\=offset=2 + X\xdfabcd\=offset=3 + X\xdfabcd\=offset=6 + X\xdfabc\xdf\=offset=6 + X\xdfabc\xdf\=offset=7 + /\x{100}/IB,utf /\x{1000}/IB,utf diff --git a/testdata/testinput12 b/testdata/testinput12 index 1cba4af..8223908 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -158,6 +158,7 @@ /X/utf XX\x{d800} + XX\x{d800}\=offset=3 XX\x{d800}\=no_utf_check XX\x{da00} XX\x{da00}\=no_utf_check @@ -169,6 +170,9 @@ XX\x{dfff}\=no_utf_check XX\x{110000} XX\x{d800}\x{1234} + +/(?<=.)X/utf + XX\x{d800}\=offset=3 /(*UTF16)\x{11234}/ abcd\x{11234}pqr diff --git a/testdata/testinput14 b/testdata/testinput14 index b4fc203..f97f3ec 100644 --- a/testdata/testinput14 +++ b/testdata/testinput14 @@ -1,155 +1,37 @@ -# These are: -# -# (1) Tests of the match-limiting features. 
The results are different for -# interpretive or JIT matching, so this test should not be run with JIT. The -# same tests are run using JIT in test 16. +# These test special (mostly error) UTF features of DFA matching. They are a +# selection of the more comprehensive tests that are run for non-DFA matching. +# The output is different for the different widths. -# (2) Other tests that must not be run with JIT. +#subject dfa -/(a+)*zz/I - aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits - aaaaaaaaaaaaaz\=find_limits +/X/utf + XX\x{d800} + XX\x{d800}\=offset=3 + XX\x{d800}\=no_utf_check + XX\x{da00} + XX\x{da00}\=no_utf_check + XX\x{dc00} + XX\x{dc00}\=no_utf_check + XX\x{de00} + XX\x{de00}\=no_utf_check + XX\x{dfff} + XX\x{dfff}\=no_utf_check + XX\x{110000} + XX\x{d800}\x{1234} + +/badutf/utf + X\xdf + XX\xef + XXX\xef\x80 + X\xf7 + XX\xf7\x80 + XXX\xf7\x80\x80 -!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I - /* this is a C style comment */\=find_limits - -/^(?>a)++/ - aa\=find_limits - aaaaaaaaa\=find_limits - -/(a)(?1)++/ - aa\=find_limits - aaaaaaaaa\=find_limits - -/a(?:.)*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits - -/a(?:.(*THEN))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits - -/a(?:.(*THEN:ABC))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits - -/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ - aabbccddee\=find_limits - -/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ - aabbccddee\=find_limits - -/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ - aabbccddee\=find_limits - -/(*LIMIT_MATCH=12bc)abc/ - -/(*LIMIT_MATCH=4294967290)abc/ - -/(*LIMIT_RECURSION=4294967280)abc/I - -/(a+)*zz/ - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=match_limit=3000 - -/(a+)*zz/ - aaaaaaaaaaaaaz\=recursion_limit=10 - -/(*LIMIT_MATCH=3000)(a+)*zz/I - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=match_limit=60000 - -/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I - aaaaaaaaaaaaaz - -/(*LIMIT_MATCH=60000)(a+)*zz/I - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=match_limit=3000 - -/(*LIMIT_RECURSION=10)(a+)*zz/I 
- aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=recursion_limit=1000 - -/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I - aaaaaaaaaaaaaz - -/(*LIMIT_RECURSION=1000)(a+)*zz/I - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=recursion_limit=10 - -# These three have infinitely nested recursions. - -/((?2))((?1))/ - abc - -/((?(R2)a+|(?1)b))/ - aaaabcde - -/(?(R)a*(?1)|((?R))b)/ - aaaabcde - -# The allusedtext modifier does not work with JIT, which does not maintain -# the leftchar/rightchar data. - -/abc(?=xyz)/allusedtext - abcxyzpqr - abcxyzpqr\=aftertext - -/(?<=pqr)abc(?=xyz)/allusedtext - xyzpqrabcxyzpqr - xyzpqrabcxyzpqr\=aftertext - -/a\b/ - a.\=allusedtext - a\=allusedtext - -/abc\Kxyz/ - abcxyz\=allusedtext - -/abc(?=xyz(*ACCEPT))/ - abcxyz\=allusedtext - -/abc(?=abcde)(?=ab)/allusedtext - abcabcdefg - -# These tests provoke recursion loops, which give a different error message -# when JIT is used. - -/(?R)/I - abcd - -/(a|(?R))/I - abcd - defg - -/(ab|(bc|(de|(?R))))/I - abcd - fghi - -/(ab|(bc|(de|(?1))))/I - abcd - fghi - -/x(ab|(bc|(de|(?1)x)x)x)/I - xab123 - xfghi - -/(?!\w)(?R)/ - abcd - =abc - -/(?=\w)(?R)/ - =abc - abcd - -/(?a)++/ + aa\=find_limits + aaaaaaaaa\=find_limits + +/(a)(?1)++/ + aa\=find_limits + aaaaaaaaa\=find_limits + +/a(?:.)*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits + +/a(?:.(*THEN))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits + +/a(?:.(*THEN:ABC))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits + +/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ + aabbccddee\=find_limits + +/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ + aabbccddee\=find_limits + +/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ + aabbccddee\=find_limits + +/(*LIMIT_MATCH=12bc)abc/ + +/(*LIMIT_MATCH=4294967290)abc/ + +/(*LIMIT_RECURSION=4294967280)abc/I + +/(a+)*zz/ + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=match_limit=3000 + +/(a+)*zz/ + aaaaaaaaaaaaaz\=recursion_limit=10 + +/(*LIMIT_MATCH=3000)(a+)*zz/I + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=match_limit=60000 + +/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I + 
aaaaaaaaaaaaaz + +/(*LIMIT_MATCH=60000)(a+)*zz/I + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=match_limit=3000 + +/(*LIMIT_RECURSION=10)(a+)*zz/I + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=recursion_limit=1000 + +/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I + aaaaaaaaaaaaaz + +/(*LIMIT_RECURSION=1000)(a+)*zz/I + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=recursion_limit=10 + +# These three have infinitely nested recursions. + +/((?2))((?1))/ + abc + +/((?(R2)a+|(?1)b))/ + aaaabcde + +/(?(R)a*(?1)|((?R))b)/ + aaaabcde + +# The allusedtext modifier does not work with JIT, which does not maintain +# the leftchar/rightchar data. + +/abc(?=xyz)/allusedtext + abcxyzpqr + abcxyzpqr\=aftertext + +/(?<=pqr)abc(?=xyz)/allusedtext + xyzpqrabcxyzpqr + xyzpqrabcxyzpqr\=aftertext + +/a\b/ + a.\=allusedtext + a\=allusedtext + +/abc\Kxyz/ + abcxyz\=allusedtext + +/abc(?=xyz(*ACCEPT))/ + abcxyz\=allusedtext + +/abc(?=abcde)(?=ab)/allusedtext + abcabcdefg + +# These tests provoke recursion loops, which give a different error message +# when JIT is used. 
+ +/(?R)/I + abcd + +/(a|(?R))/I + abcd + defg + +/(ab|(bc|(de|(?R))))/I + abcd + fghi + +/(ab|(bc|(de|(?1))))/I + abcd + fghi + +/x(ab|(bc|(de|(?1)x)x)x)/I + xab123 + xfghi + +/(?!\w)(?R)/ + abcd + =abc + +/(?=\w)(?R)/ + =abc + abcd + +/(?a)++/ - aa\=find_limits - aaaaaaaaa\=find_limits - -/(a)(?1)++/ - aa\=find_limits - aaaaaaaaa\=find_limits - -/a(?:.)*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits - -/a(?:.(*THEN))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits - -/a(?:.(*THEN:ABC))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits - -/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ - aabbccddee\=find_limits - -/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ - aabbccddee\=find_limits - -/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ - aabbccddee\=find_limits - -/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast - aabbccddee\=find_limits - aabbccddee\=jitstack=1 - -/(a+)*zz/ - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=match_limit=3000 - -/(*LIMIT_MATCH=3000)(a+)*zz/I - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=match_limit=60000 - -/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I - aaaaaaaaaaaaaz - -/(*LIMIT_MATCH=60000)(a+)*zz/I - aaaaaaaaaaaaaz - aaaaaaaaaaaaaz\=match_limit=3000 - -# These three have infinitely nested recursions. - -/((?2))((?1))/ - abc - -/((?(R2)a+|(?1)b))/ - aaaabcde - -/(?(R)a*(?1)|((?R))b)/ - aaaabcde - -# Invalid options disable JIT when called via pcre2_match(), causing the -# match to happen via the interpreter, but for fast JIT invalid options are -# ignored, so an unanchored match happens. - -/abcd/ - abcd\=anchored - fail abcd\=anchored - -/abcd/jitfast - abcd\=anchored - succeed abcd\=anchored - -# Push/pop does not lose the JIT information, though jitverify applies only to -# compilation, but serializing (save/load) discards JIT data completely. 
- -/^abc\Kdef/info,push -#pop jitverify - abcdef - -/^abc\Kdef/info,push -#save testsaved1 -#load testsaved1 -#pop jitverify - abcdef - -#load testsaved1 -#pop jit,jitverify - abcdef - -# Test pattern compilation - -/(?:a|b|c|d|e)(?R)/jit=1 - -/(?:a|b|c|d|e)(?R)(?R)/jit=1 - -/(a(?:a|b|c|d|e)b){8,16}/jit=1 - -/(?:|a|){100}x/jit=1 - -# These tests provoke recursion loops, which give a different error message -# when JIT is used. - -/(?R)/I - abcd - -/(a|(?R))/I - abcd - defg - -/(ab|(bc|(de|(?R))))/I - abcd - fghi - -/(ab|(bc|(de|(?1))))/I - abcd - fghi - -/x(ab|(bc|(de|(?1)x)x)x)/I - xab123 - xfghi - -/(?!\w)(?R)/ - abcd - =abc - -/(?=\w)(?R)/ - =abc - abcd - -/(?a)++/ + aa\=find_limits + aaaaaaaaa\=find_limits + +/(a)(?1)++/ + aa\=find_limits + aaaaaaaaa\=find_limits -/abc/ - abc\=partial_hard +/a(?:.)*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits + +/a(?:.(*THEN))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits -# Real tests +/a(?:.(*THEN:ABC))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits -/abc/ +/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ + aabbccddee\=find_limits + +/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ + aabbccddee\=find_limits + +/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ + aabbccddee\=find_limits + +/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast + aabbccddee\=find_limits + aabbccddee\=jitstack=1 + +/(a+)*zz/ + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=match_limit=3000 + +/(*LIMIT_MATCH=3000)(a+)*zz/I + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=match_limit=60000 + +/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I + aaaaaaaaaaaaaz + +/(*LIMIT_MATCH=60000)(a+)*zz/I + aaaaaaaaaaaaaz + aaaaaaaaaaaaaz\=match_limit=3000 + +# These three have infinitely nested recursions. + +/((?2))((?1))/ abc - *** Failers -/^abc|def/ +/((?(R2)a+|(?1)b))/ + aaaabcde + +/(?(R)a*(?1)|((?R))b)/ + aaaabcde + +# Invalid options disable JIT when called via pcre2_match(), causing the +# match to happen via the interpreter, but for fast JIT invalid options are +# ignored, so an unanchored match happens. 
+ +/abcd/ + abcd\=anchored + fail abcd\=anchored + +/abcd/jitfast + abcd\=anchored + succeed abcd\=anchored + +# Push/pop does not lose the JIT information, though jitverify applies only to +# compilation, but serializing (save/load) discards JIT data completely. + +/^abc\Kdef/info,push +#pop jitverify abcdef - abcdef\=notbol -/.*((abc)$|(def))/ - defabc - defabc\=noteol +/^abc\Kdef/info,push +#save testsaved1 +#load testsaved1 +#pop jitverify + abcdef + +#load testsaved1 +#pop jit,jitverify + abcdef + +# Test pattern compilation -/the quick brown fox/ - the quick brown fox - *** Failers - The Quick Brown Fox +/(?:a|b|c|d|e)(?R)/jit=1 -/the quick brown fox/i - the quick brown fox - The Quick Brown Fox +/(?:a|b|c|d|e)(?R)(?R)/jit=1 -/abc.def/ - *** Failers - abc\ndef +/(a(?:a|b|c|d|e)b){8,16}/jit=1 -/abc$/ - abc - abc\n +/(?:|a|){100}x/jit=1 -/(abc)\2/ +# These tests provoke recursion loops, which give a different error message +# when JIT is used. -/(abc\1)/ - abc +/(?R)/I + abcd -/a*(b+)(z)(z)/ - aaaabbbbzzzz - aaaabbbbzzzz\=ovector=0 - aaaabbbbzzzz\=ovector=1 - aaaabbbbzzzz\=ovector=2 +/(a|(?R))/I + abcd + defg -/ab.cd/ - ab-cd - ab=cd - ** Failers - ab\ncd +/(ab|(bc|(de|(?R))))/I + abcd + fghi -/ab.cd/s - ab-cd - ab=cd - ab\ncd +/(ab|(bc|(de|(?1))))/I + abcd + fghi -/a(b)c/no_auto_capture - abc +/x(ab|(bc|(de|(?1)x)x)x)/I + xab123 + xfghi -/a(?Pb)c/no_auto_capture - abc +/(?!\w)(?R)/ + abcd + =abc -/a?|b?/ - abc - ** Failers - ddd\=notempty +/(?=\w)(?R)/ + =abc + abcd -/\w+A/ - CDAAAAB +/(?b)c/no_auto_capture + abc + +/a?|b?/ + abc + ** Failers + ddd\=notempty + +/\w+A/ + CDAAAAB + +/\w+A/ungreedy + CDAAAAB + +/\Biss\B/I,aftertext + Mississippi + +/abc/\ + +"(?(?C)" + +# End of testdata/testinput18 diff --git a/testdata/testinput19 b/testdata/testinput19 index 155fd13..eebce2c 100644 --- a/testdata/testinput19 +++ b/testdata/testinput19 @@ -1,62 +1,17 @@ -# This set of tests exercises the serialization/deserialization functions in -# the library. 
It does not use UTF or JIT. - -#forbid_utf - -# Compile several patterns, push them onto the stack, and then write them -# all to a file. - -#pattern push - -/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT)) - (?(DEFINE) - (?[a-z]+) - (?\d+) - )/x -/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i - -#save testsaved1 - -# Do it again for some more patterns. - -/(*MARK:A)(*SKIP:B)(C|X)/mark -/(?:(?foo)|(?bar))\k/dupnames - -#save testsaved2 -#pattern -push - -# Reload the patterns, then pop them one by one and check them. - -#load testsaved1 -#load testsaved2 - -#pop info - foofoo - barbar +# This set of tests is run only with the 8-bit library. It tests the POSIX +# interface with UTF/UCP support, which is supported only with the 8-bit +# library. This test should not be run with JIT (which is not available for the +# POSIX interface). -#pop mark - C - D +#pattern posix + +/a\x{1234}b/utf + a\x{1234}b + +/\w/ + +++\x{c2} + +/\w/ucp + +++\x{c2} -#pop - AmanaplanacanalPanama - -#pop info - metcalfe 33 - -# Check for an error when different tables are used. - -/abc/push,tables=1 -/xyz/push,tables=2 -#save testsaved1 - -#pop - xyz - -#pop - abc - -#pop should give an error - pqr - -# End of testinput19 +# End of testdata/testinput19 diff --git a/testdata/testinput20 b/testdata/testinput20 new file mode 100644 index 0000000..3a8f06e --- /dev/null +++ b/testdata/testinput20 @@ -0,0 +1,62 @@ +# This set of tests exercises the serialization/deserialization functions in +# the library. It does not use UTF or JIT. + +#forbid_utf + +# Compile several patterns, push them onto the stack, and then write them +# all to a file. + +#pattern push + +/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT)) + (?(DEFINE) + (?[a-z]+) + (?\d+) + )/x +/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i + +#save testsaved1 + +# Do it again for some more patterns. + +/(*MARK:A)(*SKIP:B)(C|X)/mark +/(?:(?foo)|(?bar))\k/dupnames + +#save testsaved2 +#pattern -push + +# Reload the patterns, then pop them one by one and check them. 
+ +#load testsaved1 +#load testsaved2 + +#pop info + foofoo + barbar + +#pop mark + C + D + +#pop + AmanaplanacanalPanama + +#pop info + metcalfe 33 + +# Check for an error when different tables are used. + +/abc/push,tables=1 +/xyz/push,tables=2 +#save testsaved1 + +#pop + xyz + +#pop + abc + +#pop should give an error + pqr + +# End of testinput20 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index ef248db..89b4d36 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -235,7 +235,55 @@ No match No match \xfd\x83\x80\x80\x80\x80\=no_utf_check No match + +# Similar tests with offsets +/badutf/utf + X\xdfabcd +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=1 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=2 +No match + +/(?<=x)badutf/utf + X\xdfabcd +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=1 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=2 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=3 +No match + X\xdfabcd\xdf\=offset=3 +Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6 + +/(?<=xx)badutf/utf + X\xdfabcd +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=1 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=2 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=3 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + +/(?<=xxxx)badutf/utf + X\xdfabcd +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=1 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=2 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + X\xdfabcd\=offset=3 +Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1 + 
X\xdfabcd\=offset=6 +No match + X\xdfabc\xdf\=offset=6 +Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5 + X\xdfabc\xdf\=offset=7 +Failed: error -33: bad offset value + /\x{100}/IB,utf ------------------------------------------------------------------ Bra diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 81584dd..2676866 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -609,6 +609,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class /X/utf XX\x{d800} Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2 + XX\x{d800}\=offset=3 +No match XX\x{d800}\=no_utf_check 0: X XX\x{da00} @@ -631,6 +633,10 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2 ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16 XX\x{d800}\x{1234} Failed: error -25: UTF-16 error: invalid low surrogate at offset 3 + +/(?<=.)X/utf + XX\x{d800}\=offset=3 +Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2 /(*UTF16)\x{11234}/ abcd\x{11234}pqr diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 89ac70a..a805b88 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -602,6 +602,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class /X/utf XX\x{d800} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 + XX\x{d800}\=offset=3 +No match XX\x{d800}\=no_utf_check 0: X XX\x{da00} @@ -624,6 +626,10 @@ Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at of Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2 XX\x{d800}\x{1234} Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 + +/(?<=.)X/utf + XX\x{d800}\=offset=3 +Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2 /(*UTF16)\x{11234}/ Failed: 
error 160 at offset 5: (*VERB) not recognized or malformed diff --git a/testdata/testoutput14 b/testdata/testoutput14 deleted file mode 100644 index fe012d9..0000000 --- a/testdata/testoutput14 +++ /dev/null @@ -1,334 +0,0 @@ -# These are: -# -# (1) Tests of the match-limiting features. The results are different for -# interpretive or JIT matching, so this test should not be run with JIT. The -# same tests are run using JIT in test 16. - -# (2) Other tests that must not be run with JIT. - -/(a+)*zz/I -Capturing subpattern count = 1 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits -Minimum match limit = 8 -Minimum recursion limit = 6 - 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz - 1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - aaaaaaaaaaaaaz\=find_limits -Minimum match limit = 32768 -Minimum recursion limit = 29 -No match - -!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I -Capturing subpattern count = 1 -May match empty string -Subject length lower bound = 0 - /* this is a C style comment */\=find_limits -Minimum match limit = 120 -Minimum recursion limit = 6 - 0: /* this is a C style comment */ - 1: /* this is a C style comment */ - -/^(?>a)++/ - aa\=find_limits -Minimum match limit = 5 -Minimum recursion limit = 2 - 0: aa - aaaaaaaaa\=find_limits -Minimum match limit = 12 -Minimum recursion limit = 2 - 0: aaaaaaaaa - -/(a)(?1)++/ - aa\=find_limits -Minimum match limit = 7 -Minimum recursion limit = 4 - 0: aa - 1: a - aaaaaaaaa\=find_limits -Minimum match limit = 21 -Minimum recursion limit = 4 - 0: aaaaaaaaa - 1: a - -/a(?:.)*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits -Minimum match limit = 65 -Minimum recursion limit = 2 - 0: abbbbbbbbbbbbbbbbbbbbba - -/a(?:.(*THEN))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits -Minimum match limit = 86 -Minimum recursion limit = 45 - 0: abbbbbbbbbbbbbbbbbbbbba - 
-/a(?:.(*THEN:ABC))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits -Minimum match limit = 86 -Minimum recursion limit = 45 - 0: abbbbbbbbbbbbbbbbbbbbba - -/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ - aabbccddee\=find_limits -Minimum match limit = 7 -Minimum recursion limit = 2 - 0: aabbccddee - -/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ - aabbccddee\=find_limits -Minimum match limit = 17 -Minimum recursion limit = 16 - 0: aabbccddee - 1: aa - 2: bb - 3: cc - 4: dd - 5: ee - -/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ - aabbccddee\=find_limits -Minimum match limit = 13 -Minimum recursion limit = 10 - 0: aabbccddee - 1: aa - 2: cc - 3: ee - -/(*LIMIT_MATCH=12bc)abc/ -Failed: error 160 at offset 17: (*VERB) not recognized or malformed - -/(*LIMIT_MATCH=4294967290)abc/ -Failed: error 160 at offset 24: (*VERB) not recognized or malformed - -/(*LIMIT_RECURSION=4294967280)abc/I -Capturing subpattern count = 0 -Recursion limit = 4294967280 -First code unit = 'a' -Last code unit = 'c' -Subject length lower bound = 3 - -/(a+)*zz/ - aaaaaaaaaaaaaz -No match - aaaaaaaaaaaaaz\=match_limit=3000 -Failed: error -47: match limit exceeded - -/(a+)*zz/ - aaaaaaaaaaaaaz\=recursion_limit=10 -Failed: error -53: recursion limit exceeded - -/(*LIMIT_MATCH=3000)(a+)*zz/I -Capturing subpattern count = 1 -Match limit = 3000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaz -Failed: error -47: match limit exceeded - aaaaaaaaaaaaaz\=match_limit=60000 -Failed: error -47: match limit exceeded - -/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I -Capturing subpattern count = 1 -Match limit = 3000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaz -Failed: error -47: match limit exceeded - -/(*LIMIT_MATCH=60000)(a+)*zz/I -Capturing subpattern count = 1 -Match limit = 60000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaz -No match - aaaaaaaaaaaaaz\=match_limit=3000 
-Failed: error -47: match limit exceeded - -/(*LIMIT_RECURSION=10)(a+)*zz/I -Capturing subpattern count = 1 -Recursion limit = 10 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaz -Failed: error -53: recursion limit exceeded - aaaaaaaaaaaaaz\=recursion_limit=1000 -Failed: error -53: recursion limit exceeded - -/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I -Capturing subpattern count = 1 -Recursion limit = 1000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaz -No match - -/(*LIMIT_RECURSION=1000)(a+)*zz/I -Capturing subpattern count = 1 -Recursion limit = 1000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 - aaaaaaaaaaaaaz -No match - aaaaaaaaaaaaaz\=recursion_limit=10 -Failed: error -53: recursion limit exceeded - -# These three have infinitely nested recursions. - -/((?2))((?1))/ - abc -Failed: error -52: nested recursion at the same subject position - -/((?(R2)a+|(?1)b))/ - aaaabcde -Failed: error -52: nested recursion at the same subject position - -/(?(R)a*(?1)|((?R))b)/ - aaaabcde -Failed: error -52: nested recursion at the same subject position - -# The allusedtext modifier does not work with JIT, which does not maintain -# the leftchar/rightchar data. - -/abc(?=xyz)/allusedtext - abcxyzpqr - 0: abcxyz - >>> - abcxyzpqr\=aftertext - 0: abcxyz - >>> - 0+ xyzpqr - -/(?<=pqr)abc(?=xyz)/allusedtext - xyzpqrabcxyzpqr - 0: pqrabcxyz - <<< >>> - xyzpqrabcxyzpqr\=aftertext - 0: pqrabcxyz - <<< >>> - 0+ xyzpqr - -/a\b/ - a.\=allusedtext - 0: a. - > - a\=allusedtext - 0: a - -/abc\Kxyz/ - abcxyz\=allusedtext - 0: abcxyz - <<< - -/abc(?=xyz(*ACCEPT))/ - abcxyz\=allusedtext - 0: abcxyz - >>> - -/abc(?=abcde)(?=ab)/allusedtext - abcabcdefg - 0: abcabcde - >>>>> - -# These tests provoke recursion loops, which give a different error message -# when JIT is used. 
- -/(?R)/I -Capturing subpattern count = 0 -May match empty string -Subject length lower bound = 0 - abcd -Failed: error -52: nested recursion at the same subject position - -/(a|(?R))/I -Capturing subpattern count = 1 -May match empty string -Subject length lower bound = 1 - abcd - 0: a - 1: a - defg -Failed: error -52: nested recursion at the same subject position - -/(ab|(bc|(de|(?R))))/I -Capturing subpattern count = 3 -May match empty string -Subject length lower bound = 2 - abcd - 0: ab - 1: ab - fghi -Failed: error -52: nested recursion at the same subject position - -/(ab|(bc|(de|(?1))))/I -Capturing subpattern count = 3 -May match empty string -Subject length lower bound = 2 - abcd - 0: ab - 1: ab - fghi -Failed: error -52: nested recursion at the same subject position - -/x(ab|(bc|(de|(?1)x)x)x)/I -Capturing subpattern count = 3 -First code unit = 'x' -Subject length lower bound = 3 - xab123 - 0: xab - 1: ab - xfghi -Failed: error -52: nested recursion at the same subject position - -/(?!\w)(?R)/ - abcd -Failed: error -52: nested recursion at the same subject position - =abc -Failed: error -52: nested recursion at the same subject position - -/(?=\w)(?R)/ - =abc -Failed: error -52: nested recursion at the same subject position - abcd -Failed: error -52: nested recursion at the same subject position - -/(?a)++/ + aa\=find_limits +Minimum match limit = 5 +Minimum recursion limit = 2 + 0: aa + aaaaaaaaa\=find_limits +Minimum match limit = 12 +Minimum recursion limit = 2 + 0: aaaaaaaaa + +/(a)(?1)++/ + aa\=find_limits +Minimum match limit = 7 +Minimum recursion limit = 4 + 0: aa + 1: a + aaaaaaaaa\=find_limits +Minimum match limit = 21 +Minimum recursion limit = 4 + 0: aaaaaaaaa + 1: a + +/a(?:.)*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits +Minimum match limit = 65 +Minimum recursion limit = 2 + 0: abbbbbbbbbbbbbbbbbbbbba + +/a(?:.(*THEN))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits +Minimum match limit = 86 +Minimum recursion limit = 45 + 0: 
abbbbbbbbbbbbbbbbbbbbba + +/a(?:.(*THEN:ABC))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits +Minimum match limit = 86 +Minimum recursion limit = 45 + 0: abbbbbbbbbbbbbbbbbbbbba + +/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ + aabbccddee\=find_limits +Minimum match limit = 7 +Minimum recursion limit = 2 + 0: aabbccddee + +/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ + aabbccddee\=find_limits +Minimum match limit = 17 +Minimum recursion limit = 16 + 0: aabbccddee + 1: aa + 2: bb + 3: cc + 4: dd + 5: ee + +/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ + aabbccddee\=find_limits +Minimum match limit = 13 +Minimum recursion limit = 10 + 0: aabbccddee + 1: aa + 2: cc + 3: ee + +/(*LIMIT_MATCH=12bc)abc/ +Failed: error 160 at offset 17: (*VERB) not recognized or malformed + +/(*LIMIT_MATCH=4294967290)abc/ +Failed: error 160 at offset 24: (*VERB) not recognized or malformed + +/(*LIMIT_RECURSION=4294967280)abc/I Capturing subpattern count = 0 +Recursion limit = 4294967280 First code unit = 'a' Last code unit = 'c' Subject length lower bound = 3 -JIT support is not available in this version of PCRE2 -/a*/I +/(a+)*zz/ + aaaaaaaaaaaaaz +No match + aaaaaaaaaaaaaz\=match_limit=3000 +Failed: error -47: match limit exceeded + +/(a+)*zz/ + aaaaaaaaaaaaaz\=recursion_limit=10 +Failed: error -53: recursion limit exceeded + +/(*LIMIT_MATCH=3000)(a+)*zz/I +Capturing subpattern count = 1 +Match limit = 3000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 + aaaaaaaaaaaaaz +Failed: error -47: match limit exceeded + aaaaaaaaaaaaaz\=match_limit=60000 +Failed: error -47: match limit exceeded + +/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I +Capturing subpattern count = 1 +Match limit = 3000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 + aaaaaaaaaaaaaz +Failed: error -47: match limit exceeded + +/(*LIMIT_MATCH=60000)(a+)*zz/I +Capturing subpattern count = 1 +Match limit = 60000 +Starting code units: a z +Last code unit = 'z' +Subject length 
lower bound = 2 + aaaaaaaaaaaaaz +No match + aaaaaaaaaaaaaz\=match_limit=3000 +Failed: error -47: match limit exceeded + +/(*LIMIT_RECURSION=10)(a+)*zz/I +Capturing subpattern count = 1 +Recursion limit = 10 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 + aaaaaaaaaaaaaz +Failed: error -53: recursion limit exceeded + aaaaaaaaaaaaaz\=recursion_limit=1000 +Failed: error -53: recursion limit exceeded + +/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I +Capturing subpattern count = 1 +Recursion limit = 1000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 + aaaaaaaaaaaaaz +No match + +/(*LIMIT_RECURSION=1000)(a+)*zz/I +Capturing subpattern count = 1 +Recursion limit = 1000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 + aaaaaaaaaaaaaz +No match + aaaaaaaaaaaaaz\=recursion_limit=10 +Failed: error -53: recursion limit exceeded + +# These three have infinitely nested recursions. + +/((?2))((?1))/ + abc +Failed: error -52: nested recursion at the same subject position + +/((?(R2)a+|(?1)b))/ + aaaabcde +Failed: error -52: nested recursion at the same subject position + +/(?(R)a*(?1)|((?R))b)/ + aaaabcde +Failed: error -52: nested recursion at the same subject position + +# The allusedtext modifier does not work with JIT, which does not maintain +# the leftchar/rightchar data. + +/abc(?=xyz)/allusedtext + abcxyzpqr + 0: abcxyz + >>> + abcxyzpqr\=aftertext + 0: abcxyz + >>> + 0+ xyzpqr + +/(?<=pqr)abc(?=xyz)/allusedtext + xyzpqrabcxyzpqr + 0: pqrabcxyz + <<< >>> + xyzpqrabcxyzpqr\=aftertext + 0: pqrabcxyz + <<< >>> + 0+ xyzpqr + +/a\b/ + a.\=allusedtext + 0: a. + > + a\=allusedtext + 0: a + +/abc\Kxyz/ + abcxyz\=allusedtext + 0: abcxyz + <<< + +/abc(?=xyz(*ACCEPT))/ + abcxyz\=allusedtext + 0: abcxyz + >>> + +/abc(?=abcde)(?=ab)/allusedtext + abcabcdefg + 0: abcabcde + >>>>> + +# These tests provoke recursion loops, which give a different error message +# when JIT is used. 
+ +/(?R)/I Capturing subpattern count = 0 May match empty string Subject length lower bound = 0 + abcd +Failed: error -52: nested recursion at the same subject position + +/(a|(?R))/I +Capturing subpattern count = 1 +May match empty string +Subject length lower bound = 1 + abcd + 0: a + 1: a + defg +Failed: error -52: nested recursion at the same subject position + +/(ab|(bc|(de|(?R))))/I +Capturing subpattern count = 3 +May match empty string +Subject length lower bound = 2 + abcd + 0: ab + 1: ab + fghi +Failed: error -52: nested recursion at the same subject position + +/(ab|(bc|(de|(?1))))/I +Capturing subpattern count = 3 +May match empty string +Subject length lower bound = 2 + abcd + 0: ab + 1: ab + fghi +Failed: error -52: nested recursion at the same subject position + +/x(ab|(bc|(de|(?1)x)x)x)/I +Capturing subpattern count = 3 +First code unit = 'x' +Subject length lower bound = 3 + xab123 + 0: xab + 1: ab + xfghi +Failed: error -52: nested recursion at the same subject position + +/(?!\w)(?R)/ + abcd +Failed: error -52: nested recursion at the same subject position + =abc +Failed: error -52: nested recursion at the same subject position + +/(?=\w)(?R)/ + =abc +Failed: error -52: nested recursion at the same subject position + abcd +Failed: error -52: nested recursion at the same subject position + +/(?a)++/ - aa\=find_limits -Minimum match limit = 1 - 0: aa (JIT) - aaaaaaaaa\=find_limits -Minimum match limit = 1 - 0: aaaaaaaaa (JIT) - -/(a)(?1)++/ - aa\=find_limits -Minimum match limit = 1 - 0: aa (JIT) - 1: a - aaaaaaaaa\=find_limits -Minimum match limit = 1 - 0: aaaaaaaaa (JIT) - 1: a - -/a(?:.)*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits -Minimum match limit = 1 - 0: abbbbbbbbbbbbbbbbbbbbba (JIT) - -/a(?:.(*THEN))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits -Minimum match limit = 1 - 0: abbbbbbbbbbbbbbbbbbbbba (JIT) - -/a(?:.(*THEN:ABC))*?a/ims - abbbbbbbbbbbbbbbbbbbbba\=find_limits -Minimum match limit = 1 - 0: abbbbbbbbbbbbbbbbbbbbba (JIT) - 
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ - aabbccddee\=find_limits -Minimum match limit = 5 - 0: aabbccddee (JIT) - -/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ - aabbccddee\=find_limits -Minimum match limit = 5 - 0: aabbccddee (JIT) - 1: aa - 2: bb - 3: cc - 4: dd - 5: ee - -/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ - aabbccddee\=find_limits -Minimum match limit = 5 - 0: aabbccddee (JIT) - 1: aa - 2: cc - 3: ee - -/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast - aabbccddee\=find_limits -Minimum match limit = 5 - 0: aabbccddee (JIT) - 1: aa - 2: cc - 3: ee - aabbccddee\=jitstack=1 - 0: aabbccddee (JIT) - 1: aa - 2: cc - 3: ee - -/(a+)*zz/ - aaaaaaaaaaaaaz -No match (JIT) - aaaaaaaaaaaaaz\=match_limit=3000 -Failed: error -47: match limit exceeded - -/(*LIMIT_MATCH=3000)(a+)*zz/I -Capturing subpattern count = 1 -Match limit = 3000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 -JIT compilation was successful - aaaaaaaaaaaaaz -Failed: error -47: match limit exceeded - aaaaaaaaaaaaaz\=match_limit=60000 -Failed: error -47: match limit exceeded - -/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I -Capturing subpattern count = 1 -Match limit = 3000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 -JIT compilation was successful - aaaaaaaaaaaaaz -Failed: error -47: match limit exceeded - -/(*LIMIT_MATCH=60000)(a+)*zz/I -Capturing subpattern count = 1 -Match limit = 60000 -Starting code units: a z -Last code unit = 'z' -Subject length lower bound = 2 -JIT compilation was successful - aaaaaaaaaaaaaz -No match (JIT) - aaaaaaaaaaaaaz\=match_limit=3000 -Failed: error -47: match limit exceeded - -# These three have infinitely nested recursions. 
- -/((?2))((?1))/ - abc -Failed: error -46: JIT stack limit reached - -/((?(R2)a+|(?1)b))/ - aaaabcde -Failed: error -46: JIT stack limit reached - -/(?(R)a*(?1)|((?R))b)/ - aaaabcde -Failed: error -46: JIT stack limit reached - -# Invalid options disable JIT when called via pcre2_match(), causing the -# match to happen via the interpreter, but for fast JIT invalid options are -# ignored, so an unanchored match happens. - -/abcd/ - abcd\=anchored - 0: abcd - fail abcd\=anchored -No match - -/abcd/jitfast - abcd\=anchored - 0: abcd (JIT) - succeed abcd\=anchored - 0: abcd (JIT) - -# Push/pop does not lose the JIT information, though jitverify applies only to -# compilation, but serializing (save/load) discards JIT data completely. - -/^abc\Kdef/info,push -** Applies only to compile when pattern is stacked with 'push': jitverify -Capturing subpattern count = 0 -Compile options: -Overall options: anchored -Subject length lower bound = 6 -JIT compilation was successful -#pop jitverify - abcdef - 0: def (JIT) - -/^abc\Kdef/info,push -** Applies only to compile when pattern is stacked with 'push': jitverify -Capturing subpattern count = 0 -Compile options: -Overall options: anchored -Subject length lower bound = 6 -JIT compilation was successful -#save testsaved1 -#load testsaved1 -#pop jitverify - abcdef - 0: def - -#load testsaved1 -#pop jit,jitverify - abcdef - 0: def (JIT) - -# Test pattern compilation - -/(?:a|b|c|d|e)(?R)/jit=1 - -/(?:a|b|c|d|e)(?R)(?R)/jit=1 - -/(a(?:a|b|c|d|e)b){8,16}/jit=1 - -/(?:|a|){100}x/jit=1 - -# These tests provoke recursion loops, which give a different error message -# when JIT is used. 
- -/(?R)/I -Capturing subpattern count = 0 -May match empty string -Subject length lower bound = 0 -JIT compilation was successful - abcd -Failed: error -46: JIT stack limit reached - -/(a|(?R))/I -Capturing subpattern count = 1 -May match empty string -Subject length lower bound = 1 -JIT compilation was successful - abcd - 0: a (JIT) - 1: a - defg -Failed: error -46: JIT stack limit reached - -/(ab|(bc|(de|(?R))))/I -Capturing subpattern count = 3 -May match empty string -Subject length lower bound = 2 -JIT compilation was successful - abcd - 0: ab (JIT) - 1: ab - fghi -Failed: error -46: JIT stack limit reached - -/(ab|(bc|(de|(?1))))/I -Capturing subpattern count = 3 -May match empty string -Subject length lower bound = 2 -JIT compilation was successful - abcd - 0: ab (JIT) - 1: ab - fghi -Failed: error -46: JIT stack limit reached - -/x(ab|(bc|(de|(?1)x)x)x)/I -Capturing subpattern count = 3 -First code unit = 'x' +Last code unit = 'c' Subject length lower bound = 3 -JIT compilation was successful - xab123 - 0: xab (JIT) - 1: ab - xfghi -Failed: error -46: JIT stack limit reached +JIT support is not available in this version of PCRE2 -/(?!\w)(?R)/ - abcd -Failed: error -46: JIT stack limit reached - =abc -Failed: error -46: JIT stack limit reached - -/(?=\w)(?R)/ - =abc -Failed: error -46: JIT stack limit reached - abcd -Failed: error -46: JIT stack limit reached - -/(?a)++/ + aa\=find_limits +Minimum match limit = 1 + 0: aa (JIT) + aaaaaaaaa\=find_limits +Minimum match limit = 1 + 0: aaaaaaaaa (JIT) + +/(a)(?1)++/ + aa\=find_limits +Minimum match limit = 1 + 0: aa (JIT) + 1: a + aaaaaaaaa\=find_limits +Minimum match limit = 1 + 0: aaaaaaaaa (JIT) + 1: a -/abc/ - abc\=partial_hard -** Ignored with POSIX interface: partial_hard - 0: abc +/a(?:.)*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits +Minimum match limit = 1 + 0: abbbbbbbbbbbbbbbbbbbbba (JIT) + +/a(?:.(*THEN))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits +Minimum match limit = 1 + 0: 
abbbbbbbbbbbbbbbbbbbbba (JIT) -# Real tests +/a(?:.(*THEN:ABC))*?a/ims + abbbbbbbbbbbbbbbbbbbbba\=find_limits +Minimum match limit = 1 + 0: abbbbbbbbbbbbbbbbbbbbba (JIT) -/abc/ +/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/ + aabbccddee\=find_limits +Minimum match limit = 5 + 0: aabbccddee (JIT) + +/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/ + aabbccddee\=find_limits +Minimum match limit = 5 + 0: aabbccddee (JIT) + 1: aa + 2: bb + 3: cc + 4: dd + 5: ee + +/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/ + aabbccddee\=find_limits +Minimum match limit = 5 + 0: aabbccddee (JIT) + 1: aa + 2: cc + 3: ee + +/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast + aabbccddee\=find_limits +Minimum match limit = 5 + 0: aabbccddee (JIT) + 1: aa + 2: cc + 3: ee + aabbccddee\=jitstack=1 + 0: aabbccddee (JIT) + 1: aa + 2: cc + 3: ee + +/(a+)*zz/ + aaaaaaaaaaaaaz +No match (JIT) + aaaaaaaaaaaaaz\=match_limit=3000 +Failed: error -47: match limit exceeded + +/(*LIMIT_MATCH=3000)(a+)*zz/I +Capturing subpattern count = 1 +Match limit = 3000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 +JIT compilation was successful + aaaaaaaaaaaaaz +Failed: error -47: match limit exceeded + aaaaaaaaaaaaaz\=match_limit=60000 +Failed: error -47: match limit exceeded + +/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I +Capturing subpattern count = 1 +Match limit = 3000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 +JIT compilation was successful + aaaaaaaaaaaaaz +Failed: error -47: match limit exceeded + +/(*LIMIT_MATCH=60000)(a+)*zz/I +Capturing subpattern count = 1 +Match limit = 60000 +Starting code units: a z +Last code unit = 'z' +Subject length lower bound = 2 +JIT compilation was successful + aaaaaaaaaaaaaz +No match (JIT) + aaaaaaaaaaaaaz\=match_limit=3000 +Failed: error -47: match limit exceeded + +# These three have infinitely nested recursions. 
+ +/((?2))((?1))/ abc - 0: abc - *** Failers -No match: POSIX code 17: match failed +Failed: error -46: JIT stack limit reached -/^abc|def/ +/((?(R2)a+|(?1)b))/ + aaaabcde +Failed: error -46: JIT stack limit reached + +/(?(R)a*(?1)|((?R))b)/ + aaaabcde +Failed: error -46: JIT stack limit reached + +# Invalid options disable JIT when called via pcre2_match(), causing the +# match to happen via the interpreter, but for fast JIT invalid options are +# ignored, so an unanchored match happens. + +/abcd/ + abcd\=anchored + 0: abcd + fail abcd\=anchored +No match + +/abcd/jitfast + abcd\=anchored + 0: abcd (JIT) + succeed abcd\=anchored + 0: abcd (JIT) + +# Push/pop does not lose the JIT information, though jitverify applies only to +# compilation, but serializing (save/load) discards JIT data completely. + +/^abc\Kdef/info,push +** Applies only to compile when pattern is stacked with 'push': jitverify +Capturing subpattern count = 0 +Compile options: +Overall options: anchored +Subject length lower bound = 6 +JIT compilation was successful +#pop jitverify + abcdef + 0: def (JIT) + +/^abc\Kdef/info,push +** Applies only to compile when pattern is stacked with 'push': jitverify +Capturing subpattern count = 0 +Compile options: +Overall options: anchored +Subject length lower bound = 6 +JIT compilation was successful +#save testsaved1 +#load testsaved1 +#pop jitverify abcdef - 0: abc - abcdef\=notbol 0: def + +#load testsaved1 +#pop jit,jitverify + abcdef + 0: def (JIT) + +# Test pattern compilation -/.*((abc)$|(def))/ - defabc - 0: defabc - 1: abc - 2: abc - defabc\=noteol - 0: def - 1: def - 3: def +/(?:a|b|c|d|e)(?R)/jit=1 -/the quick brown fox/ - the quick brown fox - 0: the quick brown fox - *** Failers -No match: POSIX code 17: match failed - The Quick Brown Fox -No match: POSIX code 17: match failed +/(?:a|b|c|d|e)(?R)(?R)/jit=1 -/the quick brown fox/i - the quick brown fox - 0: the quick brown fox - The Quick Brown Fox - 0: The Quick Brown Fox 
+/(a(?:a|b|c|d|e)b){8,16}/jit=1 -/abc.def/ - *** Failers -No match: POSIX code 17: match failed - abc\ndef -No match: POSIX code 17: match failed +/(?:|a|){100}x/jit=1 -/abc$/ - abc - 0: abc - abc\n - 0: abc +# These tests provoke recursion loops, which give a different error message +# when JIT is used. -/(abc)\2/ -Failed: POSIX code 15: bad back reference at offset 6 +/(?R)/I +Capturing subpattern count = 0 +May match empty string +Subject length lower bound = 0 +JIT compilation was successful + abcd +Failed: error -46: JIT stack limit reached -/(abc\1)/ - abc -No match: POSIX code 17: match failed +/(a|(?R))/I +Capturing subpattern count = 1 +May match empty string +Subject length lower bound = 1 +JIT compilation was successful + abcd + 0: a (JIT) + 1: a + defg +Failed: error -46: JIT stack limit reached -/a*(b+)(z)(z)/ - aaaabbbbzzzz - 0: aaaabbbbzz - 1: bbbb - 2: z - 3: z - aaaabbbbzzzz\=ovector=0 -Matched without capture - aaaabbbbzzzz\=ovector=1 - 0: aaaabbbbzz - aaaabbbbzzzz\=ovector=2 - 0: aaaabbbbzz - 1: bbbb +/(ab|(bc|(de|(?R))))/I +Capturing subpattern count = 3 +May match empty string +Subject length lower bound = 2 +JIT compilation was successful + abcd + 0: ab (JIT) + 1: ab + fghi +Failed: error -46: JIT stack limit reached -/ab.cd/ - ab-cd - 0: ab-cd - ab=cd - 0: ab=cd - ** Failers -No match: POSIX code 17: match failed - ab\ncd -No match: POSIX code 17: match failed +/(ab|(bc|(de|(?1))))/I +Capturing subpattern count = 3 +May match empty string +Subject length lower bound = 2 +JIT compilation was successful + abcd + 0: ab (JIT) + 1: ab + fghi +Failed: error -46: JIT stack limit reached -/ab.cd/s - ab-cd - 0: ab-cd - ab=cd - 0: ab=cd - ab\ncd - 0: ab\x0acd +/x(ab|(bc|(de|(?1)x)x)x)/I +Capturing subpattern count = 3 +First code unit = 'x' +Subject length lower bound = 3 +JIT compilation was successful + xab123 + 0: xab (JIT) + 1: ab + xfghi +Failed: error -46: JIT stack limit reached -/a(b)c/no_auto_capture - abc -Matched with REG_NOSUB +/(?!\w)(?R)/ 
+ abcd +Failed: error -46: JIT stack limit reached + =abc +Failed: error -46: JIT stack limit reached -/a(?Pb)c/no_auto_capture - abc -Matched with REG_NOSUB +/(?=\w)(?R)/ + =abc +Failed: error -46: JIT stack limit reached + abcd +Failed: error -46: JIT stack limit reached -/a?|b?/ - abc - 0: a - ** Failers - 0: - ddd\=notempty -No match: POSIX code 17: match failed +/(?b)c/no_auto_capture + abc +Matched with REG_NOSUB + +/a?|b?/ + abc + 0: a + ** Failers + 0: + ddd\=notempty +No match: POSIX code 17: match failed + +/\w+A/ + CDAAAAB + 0: CDAAAA + +/\w+A/ungreedy + CDAAAAB + 0: CDA + +/\Biss\B/I,aftertext +** Ignored with POSIX interface: info + Mississippi + 0: iss + 0+ issippi + +/abc/\ +Failed: POSIX code 9: bad escape sequence at offset 4 + +"(?(?C)" +Failed: POSIX code 3: pattern error at offset 2 + +# End of testdata/testinput18 diff --git a/testdata/testoutput19 b/testdata/testoutput19 index 7f3aa0c..e2bdf32 100644 --- a/testdata/testoutput19 +++ b/testdata/testoutput19 @@ -1,100 +1,20 @@ -# This set of tests exercises the serialization/deserialization functions in -# the library. It does not use UTF or JIT. - -#forbid_utf - -# Compile several patterns, push them onto the stack, and then write them -# all to a file. - -#pattern push - -/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT)) - (?(DEFINE) - (?[a-z]+) - (?\d+) - )/x -/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i - -#save testsaved1 - -# Do it again for some more patterns. - -/(*MARK:A)(*SKIP:B)(C|X)/mark -** Ignored when compiled pattern is stacked with 'push': mark -/(?:(?foo)|(?bar))\k/dupnames - -#save testsaved2 -#pattern -push - -# Reload the patterns, then pop them one by one and check them. 
- -#load testsaved1 -#load testsaved2 - -#pop info -Capturing subpattern count = 2 -Max back reference = 2 -Named capturing subpatterns: - n 1 - n 2 -Options: dupnames -Starting code units: b f -Subject length lower bound = 6 - foofoo - 0: foofoo - 1: foo - barbar - 0: barbar - 1: - 2: bar +# This set of tests is run only with the 8-bit library. It tests the POSIX +# interface with UTF/UCP support, which is supported only with the 8-bit +# library. This test should not be run with JIT (which is not available for the +# POSIX interface). -#pop mark - C - 0: C - 1: C -MK: A - D -No match, mark = A +#pattern posix + +/a\x{1234}b/utf + a\x{1234}b + 0: a\x{1234}b + +/\w/ + +++\x{c2} +No match: POSIX code 17: match failed + +/\w/ucp + +++\x{c2} + 0: \xc2 -#pop - AmanaplanacanalPanama - 0: AmanaplanacanalPanama - 1: - 2: - 3: AmanaplanacanalPanama - 4: A - -#pop info -Capturing subpattern count = 4 -Named capturing subpatterns: - ADDR 2 - ADDRESS_PAT 4 - NAME 1 - NAME_PAT 3 -Options: extended -Subject length lower bound = 3 - metcalfe 33 - 0: metcalfe 33 - 1: metcalfe - 2: 33 - -# Check for an error when different tables are used. - -/abc/push,tables=1 -/xyz/push,tables=2 -#save testsaved1 -Serialization failed: error -30: patterns do not all use the same character tables - -#pop - xyz - 0: xyz - -#pop - abc - 0: abc - -#pop should give an error -** Can't pop off an empty stack - pqr - -# End of testinput19 +# End of testdata/testinput19 diff --git a/testdata/testoutput20 b/testdata/testoutput20 new file mode 100644 index 0000000..3dcf51b --- /dev/null +++ b/testdata/testoutput20 @@ -0,0 +1,100 @@ +# This set of tests exercises the serialization/deserialization functions in +# the library. It does not use UTF or JIT. + +#forbid_utf + +# Compile several patterns, push them onto the stack, and then write them +# all to a file. 
+ +#pattern push + +/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT)) + (?(DEFINE) + (?[a-z]+) + (?\d+) + )/x +/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i + +#save testsaved1 + +# Do it again for some more patterns. + +/(*MARK:A)(*SKIP:B)(C|X)/mark +** Ignored when compiled pattern is stacked with 'push': mark +/(?:(?foo)|(?bar))\k/dupnames + +#save testsaved2 +#pattern -push + +# Reload the patterns, then pop them one by one and check them. + +#load testsaved1 +#load testsaved2 + +#pop info +Capturing subpattern count = 2 +Max back reference = 2 +Named capturing subpatterns: + n 1 + n 2 +Options: dupnames +Starting code units: b f +Subject length lower bound = 6 + foofoo + 0: foofoo + 1: foo + barbar + 0: barbar + 1: + 2: bar + +#pop mark + C + 0: C + 1: C +MK: A + D +No match, mark = A + +#pop + AmanaplanacanalPanama + 0: AmanaplanacanalPanama + 1: + 2: + 3: AmanaplanacanalPanama + 4: A + +#pop info +Capturing subpattern count = 4 +Named capturing subpatterns: + ADDR 2 + ADDRESS_PAT 4 + NAME 1 + NAME_PAT 3 +Options: extended +Subject length lower bound = 3 + metcalfe 33 + 0: metcalfe 33 + 1: metcalfe + 2: 33 + +# Check for an error when different tables are used. + +/abc/push,tables=1 +/xyz/push,tables=2 +#save testsaved1 +Serialization failed: error -30: patterns do not all use the same character tables + +#pop + xyz + 0: xyz + +#pop + abc + 0: abc + +#pop should give an error +** Can't pop off an empty stack + pqr + +# End of testinput20