diff --git a/ChangeLog b/ChangeLog
index 7cd2eba..e8f5daa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -145,6 +145,10 @@ was fixed.
39. Match limit check added to recursion. This issue was found by Karl Skomski
with a custom LLVM fuzzer.
+40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look
+only at the part of the subject that is relevant when the starting offset is
+non-zero.
+
Version 10.20 30-June-2015
--------------------------
diff --git a/RunTest b/RunTest
index fb758fe..a0345c8 100755
--- a/RunTest
+++ b/RunTest
@@ -68,12 +68,13 @@ title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support"
title11="Test 11: Specials for the basic 16-bit and 32-bit libraries"
title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support"
title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries"
-title14="Test 14: Non-JIT limits and other non-JIT tests"
-title15="Test 15: JIT-specific features when JIT is not available"
-title16="Test 16: JIT-specific features when JIT is available"
-title17="Test 17: Tests of the POSIX interface, excluding UTF/UCP"
-title18="Test 18: Tests of the POSIX interface with UTF/UCP"
-title19="Test 19: Serialization tests"
+title14="Test 14: DFA specials for UTF and UCP support"
+title15="Test 15: Non-JIT limits and other non-JIT tests"
+title16="Test 16: JIT-specific features when JIT is not available"
+title17="Test 17: JIT-specific features when JIT is available"
+title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
+title19="Test 19: Tests of the POSIX interface with UTF/UCP"
+title20="Test 20: Serialization tests"
maxtest=18
if [ $# -eq 1 -a "$1" = "list" ]; then
@@ -97,6 +98,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title17
echo $title18
echo $title19
+ echo $title20
exit 0
fi
@@ -219,6 +221,7 @@ do16=no
do17=no
do18=no
do19=no
+do20=no
while [ $# -gt 0 ] ; do
case $1 in
@@ -242,10 +245,11 @@ while [ $# -gt 0 ] ; do
17) do17=yes;;
18) do18=yes;;
19) do19=yes;;
+ 20) do20=yes;;
-8) arg8=yes;;
-16) arg16=yes;;
-32) arg32=yes;;
- bigstack|-bigstack) bigstack=yes;;
+ bigstack|-bigstack) bigstack=yes;;
nojit|-nojit) nojit=yes;;
sim|-sim) shift; sim=$1;;
valgrind|-valgrind) valgrind="valgrind --tool=memcheck -q --smc-check=all";;
@@ -305,10 +309,10 @@ if [ $? -eq 0 ] ; then
else
test2stack="-S 1024"
defaultstack="-S 64"
- fi
+ fi
else
test2stack=""
- defaultstack=""
+ defaultstack=""
fi
# All of 8-bit, 16-bit, and 32-bit character strings may be supported, but only
@@ -387,7 +391,8 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
- $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no \
+ $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
+ $do20 = no \
]; then
do0=yes
do1=yes
@@ -409,6 +414,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do17=yes
do18=yes
do19=yes
+ do20=yes
fi
# Handle any explicit skips at this stage, so that an argument list may consist
@@ -688,71 +694,79 @@ for bmode in "$test8" "$test16" "$test32"; do
checkresult $? 13 ""
fi
fi
-
- # Test non-JIT match and recursion limits
+
+ # Tests for DFA UTF and UCP features. Output is different for the different widths.
if [ $do14 = yes ] ; then
echo $title14
- $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput14 testtry
- checkresult $? 14 ""
+ $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
+ checkresult $? 14-$bits "$opt"
+ fi
+
+ # Test non-JIT match and recursion limits
+
+ if [ $do15 = yes ] ; then
+ echo $title15
+ $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
+ checkresult $? 15 ""
fi
# Test JIT-specific features when JIT is not available
- if [ $do15 = yes ] ; then
- echo $title15
- if [ $jit -ne 0 ] ; then
- echo " Skipped because JIT is available"
- else
- $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
- checkresult $? 15 ""
- fi
- fi
-
- # Test JIT-specific features when JIT is available
-
if [ $do16 = yes ] ; then
echo $title16
- if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
- echo " Skipped because JIT is not available or nojit was specified"
+ if [ $jit -ne 0 ] ; then
+ echo " Skipped because JIT is available"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput16 testtry
checkresult $? 16 ""
fi
fi
- # Tests for the POSIX interface without UTF/UCP (8-bit only)
+ # Test JIT-specific features when JIT is available
if [ $do17 = yes ] ; then
echo $title17
- if [ "$bits" = "16" -o "$bits" = "32" ] ; then
- echo " Skipped when running 16/32-bit tests"
+ if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
+ echo " Skipped because JIT is not available or nojit was specified"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput17 testtry
checkresult $? 17 ""
fi
fi
- # Tests for the POSIX interface with UTF/UCP (8-bit only)
+ # Tests for the POSIX interface without UTF/UCP (8-bit only)
if [ $do18 = yes ] ; then
echo $title18
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
echo " Skipped when running 16/32-bit tests"
- elif [ $utf -eq 0 ] ; then
- echo " Skipped because UTF-$bits support is not available"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput18 testtry
checkresult $? 18 ""
fi
fi
- # Serialization tests
+ # Tests for the POSIX interface with UTF/UCP (8-bit only)
if [ $do19 = yes ] ; then
echo $title19
- $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
- checkresult $? 19 ""
+ if [ "$bits" = "16" -o "$bits" = "32" ] ; then
+ echo " Skipped when running 16/32-bit tests"
+ elif [ $utf -eq 0 ] ; then
+ echo " Skipped because UTF-$bits support is not available"
+ else
+ $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
+ checkresult $? 19 ""
+ fi
+ fi
+
+ # Serialization tests
+
+ if [ $do20 = yes ] ; then
+ echo $title20
+ $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
+ checkresult $? 20 ""
fi
# End of loop for 8/16/32-bit tests
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index d8ae9d6..7cd20d3 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "29 July 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "18 August 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@@ -2022,12 +2022,19 @@ If the pattern is anchored, such a match can occur only if the pattern contains
.sp
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
string is checked by default when \fBpcre2_match()\fP is subsequently called.
-The entire string is checked before any other processing takes place, and a
+If a non-zero starting offset is given, the check is applied only to that part
+of the subject that could be inspected during matching, and there is a check
+that the starting offset points to the first code unit of a character or to the
+end of the subject. If there are no lookbehind assertions in the pattern, the
+check starts at the starting offset. Otherwise, it starts N characters before
+the starting offset, where N is the length of the longest lookbehind, or at the
+start of the subject if fewer than N characters precede the starting offset.
+Note that the sequences \eb and \eB are one-character lookbehinds.
+.P
+The check is carried out before any other processing takes place, and a
negative error code is returned if the check fails. There are several UTF error
codes for each code unit width, corresponding to different problems with the
-code unit sequence. The value of \fIstartoffset\fP is also checked, to ensure
-that it points to the start of a character or to the end of the subject. There
-are discussions about the validity of
+code unit sequence. There are discussions about the validity of
.\" HTML
.\"
UTF-8 strings,
@@ -2939,6 +2946,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 29 July 2015
+Last updated: 18 August 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
diff --git a/doc/pcre2unicode.3 b/doc/pcre2unicode.3
index 6c32bc0..cd98ce8 100644
--- a/doc/pcre2unicode.3
+++ b/doc/pcre2unicode.3
@@ -1,4 +1,4 @@
-.TH PCRE2UNICODE 3 "23 November 2014" "PCRE2 10.00"
+.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
.SH NAME
PCRE - Perl-compatible regular expressions (revised API)
.SH "UNICODE AND UTF SUPPORT"
@@ -117,11 +117,21 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code knows
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
strings to be in host byte order.
.P
-The entire string is checked before any other processing takes place. In
-addition to checking the format of the string, there is a check to ensure that
-all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
-The so-called "non-character" code points are not excluded because Unicode
-corrigendum #9 makes it clear that they should not be.
+A UTF string is checked before any other processing takes place. In the case of
+\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
+offset, the check is applied only to that part of the subject that could be
+inspected during matching, and there is a check that the starting offset points
+to the first code unit of a character or to the end of the subject. If there
+are no lookbehind assertions in the pattern, the check starts at the starting
+offset. Otherwise, it starts N characters before the starting offset, where N
+is the length of the longest lookbehind, or at the start of the subject if
+fewer than N characters precede the starting offset. Note that the
+sequences \eb and \eB are one-character lookbehinds.
+.P
+In addition to checking the format of the string, there is a check to ensure
+that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
+area. The so-called "non-character" code points are not excluded because
+Unicode corrigendum #9 makes it clear that they should not be.
.P
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
where they are used in pairs to encode code points with values greater than
@@ -252,6 +262,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 23 November 2014
-Copyright (c) 1997-2014 University of Cambridge.
+Last updated: 18 August 2015
+Copyright (c) 1997-2015 University of Cambridge.
.fi
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index aa65fd3..4fe0236 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -4682,7 +4682,7 @@ for (;; ptr++)
that it's a length rather than a small character. */
#ifdef MAYBE_UTF_MULTI
- if (utf && NOT_FIRSTCHAR(code[-1]))
+ if (utf && NOT_FIRSTCU(code[-1]))
{
PCRE2_UCHAR *lastchar = code - 1;
BACKCHAR(lastchar);
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index ff4d332..3cfa454 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -2774,7 +2774,7 @@ for (;;)
{
PCRE2_SPTR p = start_subject + local_offsets[rc];
PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
}
#endif
if (charcount > 0)
@@ -2874,7 +2874,7 @@ for (;;)
PCRE2_SPTR pp = local_ptr;
charcount = (int)(pp - p);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
}
@@ -2960,7 +2960,7 @@ for (;;)
{
PCRE2_SPTR p = start_subject + local_offsets[0];
PCRE2_SPTR pp = start_subject + local_offsets[1];
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
}
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
@@ -3264,18 +3264,50 @@ switch(re->newline_convention)
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
-multiunit character. */
+multiunit character. We check only the portion of the subject that is going to
+be inspected during matching - from the offset minus the maximum lookbehind
+to the given length. This saves time when a small part of a large subject is
+being matched by the use of a starting offset. Note that the maximum lookbehind
+is a number of characters, not code units. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
- match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
- if (match_data->rc != 0) return match_data->rc;
+ PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
+
+ if (start_offset > 0)
+ {
#if PCRE2_CODE_UNIT_WIDTH != 32
- if (start_offset > 0 && start_offset < length &&
- NOT_FIRSTCHAR(subject[start_offset]))
- return PCRE2_ERROR_BADUTFOFFSET;
+ unsigned int i;
+ if (start_match < end_subject && NOT_FIRSTCU(*start_match))
+ return PCRE2_ERROR_BADUTFOFFSET;
+ for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
+ {
+ check_subject--;
+ while (check_subject > subject &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ (*check_subject & 0xc0) == 0x80)
+#else /* 16-bit */
+ (*check_subject & 0xfc00) == 0xdc00)
+#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
+ check_subject--;
+ }
+#else /* In the 32-bit library, one code unit equals one character. */
+ check_subject -= re->max_lookbehind;
+ if (check_subject < subject) check_subject = subject;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
+ }
+
+ /* Validate the relevant portion of the subject. After an error, adjust the
+ offset to be an absolute offset in the whole string. */
+
+ match_data->rc = PRIV(valid_utf)(check_subject,
+ length - (check_subject - subject), &(match_data->startchar));
+ if (match_data->rc != 0)
+ {
+ match_data->startchar += check_subject - subject;
+ return match_data->rc;
+ }
}
#endif /* SUPPORT_UNICODE */
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
index 704c375..85ceb06 100644
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@@ -72,7 +72,7 @@ just to undefine them all. */
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
-#undef NOT_FIRSTCHAR
+#undef NOT_FIRSTCU
#undef PUT
#undef PUT2
#undef PUT2INC
@@ -252,7 +252,7 @@ UTF support is omitted, we don't even define them. */
/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
-/* #define NOT_FIRSTCHAR(c) */
+/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@@ -285,10 +285,10 @@ Otherwise it has an undefined behaviour. */
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
-/* Returns TRUE, if the given character is not the first character
-of a UTF sequence. */
+/* Returns TRUE, if the given value is not the first code unit of a UTF
+sequence. */
-#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
+#define NOT_FIRSTCU(c) (((c) & 0xc0) == 0x80)
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
@@ -371,10 +371,10 @@ Otherwise it has an undefined behaviour. */
#define GET_EXTRALEN(c) 1
-/* Returns TRUE, if the given character is not the first character
-of a UTF sequence. */
+/* Returns TRUE, if the given value is not the first code unit of a UTF
+sequence. */
-#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00)
+#define NOT_FIRSTCU(c) (((c) & 0xfc00) == 0xdc00)
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */
@@ -469,7 +469,7 @@ into one PCRE2_UCHAR unit. */
#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
-#define NOT_FIRSTCHAR(c) (0)
+#define NOT_FIRSTCU(c) (0)
/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 41113a5..8875d6d 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6485,6 +6485,7 @@ mb->match_frames_base = &frame_zero;
subject string. */
if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
+end_subject = subject + length;
/* Plausibility checks */
@@ -6536,18 +6537,50 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
-multiunit character. */
+multiunit character. We check only the portion of the subject that is going to
+be inspected during matching - from the offset minus the maximum lookbehind
+to the given length. This saves time when a small part of a large subject is
+being matched by the use of a starting offset. Note that the maximum lookbehind
+is a number of characters, not code units. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
- match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
- if (match_data->rc != 0) return match_data->rc;
+ PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
+
+ if (start_offset > 0)
+ {
#if PCRE2_CODE_UNIT_WIDTH != 32
- if (start_offset > 0 && start_offset < length &&
- NOT_FIRSTCHAR(subject[start_offset]))
- return PCRE2_ERROR_BADUTFOFFSET;
+ unsigned int i;
+ if (start_match < end_subject && NOT_FIRSTCU(*start_match))
+ return PCRE2_ERROR_BADUTFOFFSET;
+ for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
+ {
+ check_subject--;
+ while (check_subject > subject &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ (*check_subject & 0xc0) == 0x80)
+#else /* 16-bit */
+ (*check_subject & 0xfc00) == 0xdc00)
+#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
+ check_subject--;
+ }
+#else /* In the 32-bit library, one code unit equals one character. */
+ check_subject -= re->max_lookbehind;
+ if (check_subject < subject) check_subject = subject;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
+ }
+
+ /* Validate the relevant portion of the subject. After an error, adjust the
+ offset to be an absolute offset in the whole string. */
+
+ match_data->rc = PRIV(valid_utf)(check_subject,
+ length - (check_subject - subject), &(match_data->startchar));
+ if (match_data->rc != 0)
+ {
+ match_data->startchar += check_subject - subject;
+ return match_data->rc;
+ }
}
#endif /* SUPPORT_UNICODE */
@@ -6594,7 +6627,7 @@ else
mb->start_subject = subject;
mb->start_offset = start_offset;
-mb->end_subject = end_subject = mb->start_subject + length;
+mb->end_subject = end_subject;
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
mb->moptions = options; /* Match options */
diff --git a/testdata/testinput10 b/testdata/testinput10
index a1fdd92..dc43629 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -132,7 +132,36 @@
\xf9\x87\x80\x80\x80\=no_utf_check
\xfc\x84\x80\x80\x80\x80\=no_utf_check
\xfd\x83\x80\x80\x80\x80\=no_utf_check
+
+# Similar tests with offsets
+/badutf/utf
+ X\xdfabcd
+ X\xdfabcd\=offset=1
+ X\xdfabcd\=offset=2
+
+/(?<=x)badutf/utf
+ X\xdfabcd
+ X\xdfabcd\=offset=1
+ X\xdfabcd\=offset=2
+ X\xdfabcd\=offset=3
+ X\xdfabcd\xdf\=offset=3
+
+/(?<=xx)badutf/utf
+ X\xdfabcd
+ X\xdfabcd\=offset=1
+ X\xdfabcd\=offset=2
+ X\xdfabcd\=offset=3
+
+/(?<=xxxx)badutf/utf
+ X\xdfabcd
+ X\xdfabcd\=offset=1
+ X\xdfabcd\=offset=2
+ X\xdfabcd\=offset=3
+ X\xdfabcd\=offset=6
+ X\xdfabc\xdf\=offset=6
+ X\xdfabc\xdf\=offset=7
+
/\x{100}/IB,utf
/\x{1000}/IB,utf
diff --git a/testdata/testinput12 b/testdata/testinput12
index 1cba4af..8223908 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -158,6 +158,7 @@
/X/utf
XX\x{d800}
+ XX\x{d800}\=offset=3
XX\x{d800}\=no_utf_check
XX\x{da00}
XX\x{da00}\=no_utf_check
@@ -169,6 +170,9 @@
XX\x{dfff}\=no_utf_check
XX\x{110000}
XX\x{d800}\x{1234}
+
+/(?<=.)X/utf
+ XX\x{d800}\=offset=3
/(*UTF16)\x{11234}/
abcd\x{11234}pqr
diff --git a/testdata/testinput14 b/testdata/testinput14
index b4fc203..f97f3ec 100644
--- a/testdata/testinput14
+++ b/testdata/testinput14
@@ -1,155 +1,37 @@
-# These are:
-#
-# (1) Tests of the match-limiting features. The results are different for
-# interpretive or JIT matching, so this test should not be run with JIT. The
-# same tests are run using JIT in test 16.
+# These test special (mostly error) UTF features of DFA matching. They are a
+# selection of the more comprehensive tests that are run for non-DFA matching.
+# The output is different for the different widths.
-# (2) Other tests that must not be run with JIT.
+#subject dfa
-/(a+)*zz/I
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
- aaaaaaaaaaaaaz\=find_limits
+/X/utf
+ XX\x{d800}
+ XX\x{d800}\=offset=3
+ XX\x{d800}\=no_utf_check
+ XX\x{da00}
+ XX\x{da00}\=no_utf_check
+ XX\x{dc00}
+ XX\x{dc00}\=no_utf_check
+ XX\x{de00}
+ XX\x{de00}\=no_utf_check
+ XX\x{dfff}
+ XX\x{dfff}\=no_utf_check
+ XX\x{110000}
+ XX\x{d800}\x{1234}
+
+/badutf/utf
+ X\xdf
+ XX\xef
+ XXX\xef\x80
+ X\xf7
+ XX\xf7\x80
+ XXX\xf7\x80\x80
-!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
- /* this is a C style comment */\=find_limits
-
-/^(?>a)++/
- aa\=find_limits
- aaaaaaaaa\=find_limits
-
-/(a)(?1)++/
- aa\=find_limits
- aaaaaaaaa\=find_limits
-
-/a(?:.)*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/a(?:.(*THEN))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/a(?:.(*THEN:ABC))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
- aabbccddee\=find_limits
-
-/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
- aabbccddee\=find_limits
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
- aabbccddee\=find_limits
-
-/(*LIMIT_MATCH=12bc)abc/
-
-/(*LIMIT_MATCH=4294967290)abc/
-
-/(*LIMIT_RECURSION=4294967280)abc/I
-
-/(a+)*zz/
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=match_limit=3000
-
-/(a+)*zz/
- aaaaaaaaaaaaaz\=recursion_limit=10
-
-/(*LIMIT_MATCH=3000)(a+)*zz/I
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=match_limit=60000
-
-/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
- aaaaaaaaaaaaaz
-
-/(*LIMIT_MATCH=60000)(a+)*zz/I
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=match_limit=3000
-
-/(*LIMIT_RECURSION=10)(a+)*zz/I
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=recursion_limit=1000
-
-/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
- aaaaaaaaaaaaaz
-
-/(*LIMIT_RECURSION=1000)(a+)*zz/I
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=recursion_limit=10
-
-# These three have infinitely nested recursions.
-
-/((?2))((?1))/
- abc
-
-/((?(R2)a+|(?1)b))/
- aaaabcde
-
-/(?(R)a*(?1)|((?R))b)/
- aaaabcde
-
-# The allusedtext modifier does not work with JIT, which does not maintain
-# the leftchar/rightchar data.
-
-/abc(?=xyz)/allusedtext
- abcxyzpqr
- abcxyzpqr\=aftertext
-
-/(?<=pqr)abc(?=xyz)/allusedtext
- xyzpqrabcxyzpqr
- xyzpqrabcxyzpqr\=aftertext
-
-/a\b/
- a.\=allusedtext
- a\=allusedtext
-
-/abc\Kxyz/
- abcxyz\=allusedtext
-
-/abc(?=xyz(*ACCEPT))/
- abcxyz\=allusedtext
-
-/abc(?=abcde)(?=ab)/allusedtext
- abcabcdefg
-
-# These tests provoke recursion loops, which give a different error message
-# when JIT is used.
-
-/(?R)/I
- abcd
-
-/(a|(?R))/I
- abcd
- defg
-
-/(ab|(bc|(de|(?R))))/I
- abcd
- fghi
-
-/(ab|(bc|(de|(?1))))/I
- abcd
- fghi
-
-/x(ab|(bc|(de|(?1)x)x)x)/I
- xab123
- xfghi
-
-/(?!\w)(?R)/
- abcd
- =abc
-
-/(?=\w)(?R)/
- =abc
- abcd
-
-/(?a)++/
+ aa\=find_limits
+ aaaaaaaaa\=find_limits
+
+/(a)(?1)++/
+ aa\=find_limits
+ aaaaaaaaa\=find_limits
+
+/a(?:.)*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+
+/a(?:.(*THEN))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+
+/a(?:.(*THEN:ABC))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+
+/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
+ aabbccddee\=find_limits
+
+/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
+ aabbccddee\=find_limits
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
+ aabbccddee\=find_limits
+
+/(*LIMIT_MATCH=12bc)abc/
+
+/(*LIMIT_MATCH=4294967290)abc/
+
+/(*LIMIT_RECURSION=4294967280)abc/I
+
+/(a+)*zz/
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=match_limit=3000
+
+/(a+)*zz/
+ aaaaaaaaaaaaaz\=recursion_limit=10
+
+/(*LIMIT_MATCH=3000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=match_limit=60000
+
+/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+
+/(*LIMIT_MATCH=60000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=match_limit=3000
+
+/(*LIMIT_RECURSION=10)(a+)*zz/I
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=recursion_limit=1000
+
+/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+
+/(*LIMIT_RECURSION=1000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=recursion_limit=10
+
+# These three have infinitely nested recursions.
+
+/((?2))((?1))/
+ abc
+
+/((?(R2)a+|(?1)b))/
+ aaaabcde
+
+/(?(R)a*(?1)|((?R))b)/
+ aaaabcde
+
+# The allusedtext modifier does not work with JIT, which does not maintain
+# the leftchar/rightchar data.
+
+/abc(?=xyz)/allusedtext
+ abcxyzpqr
+ abcxyzpqr\=aftertext
+
+/(?<=pqr)abc(?=xyz)/allusedtext
+ xyzpqrabcxyzpqr
+ xyzpqrabcxyzpqr\=aftertext
+
+/a\b/
+ a.\=allusedtext
+ a\=allusedtext
+
+/abc\Kxyz/
+ abcxyz\=allusedtext
+
+/abc(?=xyz(*ACCEPT))/
+ abcxyz\=allusedtext
+
+/abc(?=abcde)(?=ab)/allusedtext
+ abcabcdefg
+
+# These tests provoke recursion loops, which give a different error message
+# when JIT is used.
+
+/(?R)/I
+ abcd
+
+/(a|(?R))/I
+ abcd
+ defg
+
+/(ab|(bc|(de|(?R))))/I
+ abcd
+ fghi
+
+/(ab|(bc|(de|(?1))))/I
+ abcd
+ fghi
+
+/x(ab|(bc|(de|(?1)x)x)x)/I
+ xab123
+ xfghi
+
+/(?!\w)(?R)/
+ abcd
+ =abc
+
+/(?=\w)(?R)/
+ =abc
+ abcd
+
+/(?a)++/
- aa\=find_limits
- aaaaaaaaa\=find_limits
-
-/(a)(?1)++/
- aa\=find_limits
- aaaaaaaaa\=find_limits
-
-/a(?:.)*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/a(?:.(*THEN))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/a(?:.(*THEN:ABC))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
- aabbccddee\=find_limits
-
-/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
- aabbccddee\=find_limits
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
- aabbccddee\=find_limits
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast
- aabbccddee\=find_limits
- aabbccddee\=jitstack=1
-
-/(a+)*zz/
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=match_limit=3000
-
-/(*LIMIT_MATCH=3000)(a+)*zz/I
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=match_limit=60000
-
-/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
- aaaaaaaaaaaaaz
-
-/(*LIMIT_MATCH=60000)(a+)*zz/I
- aaaaaaaaaaaaaz
- aaaaaaaaaaaaaz\=match_limit=3000
-
-# These three have infinitely nested recursions.
-
-/((?2))((?1))/
- abc
-
-/((?(R2)a+|(?1)b))/
- aaaabcde
-
-/(?(R)a*(?1)|((?R))b)/
- aaaabcde
-
-# Invalid options disable JIT when called via pcre2_match(), causing the
-# match to happen via the interpreter, but for fast JIT invalid options are
-# ignored, so an unanchored match happens.
-
-/abcd/
- abcd\=anchored
- fail abcd\=anchored
-
-/abcd/jitfast
- abcd\=anchored
- succeed abcd\=anchored
-
-# Push/pop does not lose the JIT information, though jitverify applies only to
-# compilation, but serializing (save/load) discards JIT data completely.
-
-/^abc\Kdef/info,push
-#pop jitverify
- abcdef
-
-/^abc\Kdef/info,push
-#save testsaved1
-#load testsaved1
-#pop jitverify
- abcdef
-
-#load testsaved1
-#pop jit,jitverify
- abcdef
-
-# Test pattern compilation
-
-/(?:a|b|c|d|e)(?R)/jit=1
-
-/(?:a|b|c|d|e)(?R)(?R)/jit=1
-
-/(a(?:a|b|c|d|e)b){8,16}/jit=1
-
-/(?:|a|){100}x/jit=1
-
-# These tests provoke recursion loops, which give a different error message
-# when JIT is used.
-
-/(?R)/I
- abcd
-
-/(a|(?R))/I
- abcd
- defg
-
-/(ab|(bc|(de|(?R))))/I
- abcd
- fghi
-
-/(ab|(bc|(de|(?1))))/I
- abcd
- fghi
-
-/x(ab|(bc|(de|(?1)x)x)x)/I
- xab123
- xfghi
-
-/(?!\w)(?R)/
- abcd
- =abc
-
-/(?=\w)(?R)/
- =abc
- abcd
-
-/(?a)++/
+ aa\=find_limits
+ aaaaaaaaa\=find_limits
+
+/(a)(?1)++/
+ aa\=find_limits
+ aaaaaaaaa\=find_limits
-/abc/
- abc\=partial_hard
+/a(?:.)*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+
+/a(?:.(*THEN))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
-# Real tests
+/a(?:.(*THEN:ABC))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
-/abc/
+/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
+ aabbccddee\=find_limits
+
+/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
+ aabbccddee\=find_limits
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
+ aabbccddee\=find_limits
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast
+ aabbccddee\=find_limits
+ aabbccddee\=jitstack=1
+
+/(a+)*zz/
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=match_limit=3000
+
+/(*LIMIT_MATCH=3000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=match_limit=60000
+
+/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+
+/(*LIMIT_MATCH=60000)(a+)*zz/I
+ aaaaaaaaaaaaaz
+ aaaaaaaaaaaaaz\=match_limit=3000
+
+# These three have infinitely nested recursions.
+
+/((?2))((?1))/
abc
- *** Failers
-/^abc|def/
+/((?(R2)a+|(?1)b))/
+ aaaabcde
+
+/(?(R)a*(?1)|((?R))b)/
+ aaaabcde
+
+# Invalid options disable JIT when called via pcre2_match(), causing the
+# match to happen via the interpreter, but for fast JIT invalid options are
+# ignored, so an unanchored match happens.
+
+/abcd/
+ abcd\=anchored
+ fail abcd\=anchored
+
+/abcd/jitfast
+ abcd\=anchored
+ succeed abcd\=anchored
+
+# Push/pop does not lose the JIT information, though jitverify applies only to
+# compilation, but serializing (save/load) discards JIT data completely.
+
+/^abc\Kdef/info,push
+#pop jitverify
abcdef
- abcdef\=notbol
-/.*((abc)$|(def))/
- defabc
- defabc\=noteol
+/^abc\Kdef/info,push
+#save testsaved1
+#load testsaved1
+#pop jitverify
+ abcdef
+
+#load testsaved1
+#pop jit,jitverify
+ abcdef
+
+# Test pattern compilation
-/the quick brown fox/
- the quick brown fox
- *** Failers
- The Quick Brown Fox
+/(?:a|b|c|d|e)(?R)/jit=1
-/the quick brown fox/i
- the quick brown fox
- The Quick Brown Fox
+/(?:a|b|c|d|e)(?R)(?R)/jit=1
-/abc.def/
- *** Failers
- abc\ndef
+/(a(?:a|b|c|d|e)b){8,16}/jit=1
-/abc$/
- abc
- abc\n
+/(?:|a|){100}x/jit=1
-/(abc)\2/
+# These tests provoke recursion loops, which give a different error message
+# when JIT is used.
-/(abc\1)/
- abc
+/(?R)/I
+ abcd
-/a*(b+)(z)(z)/
- aaaabbbbzzzz
- aaaabbbbzzzz\=ovector=0
- aaaabbbbzzzz\=ovector=1
- aaaabbbbzzzz\=ovector=2
+/(a|(?R))/I
+ abcd
+ defg
-/ab.cd/
- ab-cd
- ab=cd
- ** Failers
- ab\ncd
+/(ab|(bc|(de|(?R))))/I
+ abcd
+ fghi
-/ab.cd/s
- ab-cd
- ab=cd
- ab\ncd
+/(ab|(bc|(de|(?1))))/I
+ abcd
+ fghi
-/a(b)c/no_auto_capture
- abc
+/x(ab|(bc|(de|(?1)x)x)x)/I
+ xab123
+ xfghi
-/a(?Pb)c/no_auto_capture
- abc
+/(?!\w)(?R)/
+ abcd
+ =abc
-/a?|b?/
- abc
- ** Failers
- ddd\=notempty
+/(?=\w)(?R)/
+ =abc
+ abcd
-/\w+A/
- CDAAAAB
+/(?b)c/no_auto_capture
+ abc
+
+/a?|b?/
+ abc
+ ** Failers
+ ddd\=notempty
+
+/\w+A/
+ CDAAAAB
+
+/\w+A/ungreedy
+ CDAAAAB
+
+/\Biss\B/I,aftertext
+ Mississippi
+
+/abc/\
+
+"(?(?C)"
+
+# End of testdata/testinput18
diff --git a/testdata/testinput19 b/testdata/testinput19
index 155fd13..eebce2c 100644
--- a/testdata/testinput19
+++ b/testdata/testinput19
@@ -1,62 +1,17 @@
-# This set of tests exercises the serialization/deserialization functions in
-# the library. It does not use UTF or JIT.
-
-#forbid_utf
-
-# Compile several patterns, push them onto the stack, and then write them
-# all to a file.
-
-#pattern push
-
-/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT))
- (?(DEFINE)
- (?[a-z]+)
- (?\d+)
- )/x
-/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
-
-#save testsaved1
-
-# Do it again for some more patterns.
-
-/(*MARK:A)(*SKIP:B)(C|X)/mark
-/(?:(?foo)|(?bar))\k/dupnames
-
-#save testsaved2
-#pattern -push
-
-# Reload the patterns, then pop them one by one and check them.
-
-#load testsaved1
-#load testsaved2
-
-#pop info
- foofoo
- barbar
+# This set of tests is run only with the 8-bit library. It tests the POSIX
+# interface with UTF/UCP support, which is supported only with the 8-bit
+# library. This test should not be run with JIT (which is not available for the
+# POSIX interface).
-#pop mark
- C
- D
+#pattern posix
+
+/a\x{1234}b/utf
+ a\x{1234}b
+
+/\w/
+ +++\x{c2}
+
+/\w/ucp
+ +++\x{c2}
-#pop
- AmanaplanacanalPanama
-
-#pop info
- metcalfe 33
-
-# Check for an error when different tables are used.
-
-/abc/push,tables=1
-/xyz/push,tables=2
-#save testsaved1
-
-#pop
- xyz
-
-#pop
- abc
-
-#pop should give an error
- pqr
-
-# End of testinput19
+# End of testdata/testinput19
diff --git a/testdata/testinput20 b/testdata/testinput20
new file mode 100644
index 0000000..3a8f06e
--- /dev/null
+++ b/testdata/testinput20
@@ -0,0 +1,62 @@
+# This set of tests exercises the serialization/deserialization functions in
+# the library. It does not use UTF or JIT.
+
+#forbid_utf
+
+# Compile several patterns, push them onto the stack, and then write them
+# all to a file.
+
+#pattern push
+
+/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT))
+ (?(DEFINE)
+ (?[a-z]+)
+ (?\d+)
+ )/x
+/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
+
+#save testsaved1
+
+# Do it again for some more patterns.
+
+/(*MARK:A)(*SKIP:B)(C|X)/mark
+/(?:(?foo)|(?bar))\k/dupnames
+
+#save testsaved2
+#pattern -push
+
+# Reload the patterns, then pop them one by one and check them.
+
+#load testsaved1
+#load testsaved2
+
+#pop info
+ foofoo
+ barbar
+
+#pop mark
+ C
+ D
+
+#pop
+ AmanaplanacanalPanama
+
+#pop info
+ metcalfe 33
+
+# Check for an error when different tables are used.
+
+/abc/push,tables=1
+/xyz/push,tables=2
+#save testsaved1
+
+#pop
+ xyz
+
+#pop
+ abc
+
+#pop should give an error
+ pqr
+
+# End of testinput20
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index ef248db..89b4d36 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -235,7 +235,55 @@ No match
No match
\xfd\x83\x80\x80\x80\x80\=no_utf_check
No match
+
+# Similar tests with offsets
+/badutf/utf
+ X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=2
+No match
+
+/(?<=x)badutf/utf
+ X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=2
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=3
+No match
+ X\xdfabcd\xdf\=offset=3
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6
+
+/(?<=xx)badutf/utf
+ X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=2
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=3
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+
+/(?<=xxxx)badutf/utf
+ X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=2
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=3
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+ X\xdfabcd\=offset=6
+No match
+ X\xdfabc\xdf\=offset=6
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5
+ X\xdfabc\xdf\=offset=7
+Failed: error -33: bad offset value
+
/\x{100}/IB,utf
------------------------------------------------------------------
Bra
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index 81584dd..2676866 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -609,6 +609,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
/X/utf
XX\x{d800}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+ XX\x{d800}\=offset=3
+No match
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
@@ -631,6 +633,10 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
XX\x{d800}\x{1234}
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
+
+/(?<=.)X/utf
+ XX\x{d800}\=offset=3
+Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
/(*UTF16)\x{11234}/
abcd\x{11234}pqr
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 89ac70a..a805b88 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -602,6 +602,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
/X/utf
XX\x{d800}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+ XX\x{d800}\=offset=3
+No match
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
@@ -624,6 +626,10 @@ Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at of
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2
XX\x{d800}\x{1234}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+
+/(?<=.)X/utf
+ XX\x{d800}\=offset=3
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
/(*UTF16)\x{11234}/
Failed: error 160 at offset 5: (*VERB) not recognized or malformed
diff --git a/testdata/testoutput14 b/testdata/testoutput14
deleted file mode 100644
index fe012d9..0000000
--- a/testdata/testoutput14
+++ /dev/null
@@ -1,334 +0,0 @@
-# These are:
-#
-# (1) Tests of the match-limiting features. The results are different for
-# interpretive or JIT matching, so this test should not be run with JIT. The
-# same tests are run using JIT in test 16.
-
-# (2) Other tests that must not be run with JIT.
-
-/(a+)*zz/I
-Capturing subpattern count = 1
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
-Minimum match limit = 8
-Minimum recursion limit = 6
- 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
- 1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
- aaaaaaaaaaaaaz\=find_limits
-Minimum match limit = 32768
-Minimum recursion limit = 29
-No match
-
-!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
-Capturing subpattern count = 1
-May match empty string
-Subject length lower bound = 0
- /* this is a C style comment */\=find_limits
-Minimum match limit = 120
-Minimum recursion limit = 6
- 0: /* this is a C style comment */
- 1: /* this is a C style comment */
-
-/^(?>a)++/
- aa\=find_limits
-Minimum match limit = 5
-Minimum recursion limit = 2
- 0: aa
- aaaaaaaaa\=find_limits
-Minimum match limit = 12
-Minimum recursion limit = 2
- 0: aaaaaaaaa
-
-/(a)(?1)++/
- aa\=find_limits
-Minimum match limit = 7
-Minimum recursion limit = 4
- 0: aa
- 1: a
- aaaaaaaaa\=find_limits
-Minimum match limit = 21
-Minimum recursion limit = 4
- 0: aaaaaaaaa
- 1: a
-
-/a(?:.)*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 65
-Minimum recursion limit = 2
- 0: abbbbbbbbbbbbbbbbbbbbba
-
-/a(?:.(*THEN))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 86
-Minimum recursion limit = 45
- 0: abbbbbbbbbbbbbbbbbbbbba
-
-/a(?:.(*THEN:ABC))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 86
-Minimum recursion limit = 45
- 0: abbbbbbbbbbbbbbbbbbbbba
-
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
- aabbccddee\=find_limits
-Minimum match limit = 7
-Minimum recursion limit = 2
- 0: aabbccddee
-
-/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
- aabbccddee\=find_limits
-Minimum match limit = 17
-Minimum recursion limit = 16
- 0: aabbccddee
- 1: aa
- 2: bb
- 3: cc
- 4: dd
- 5: ee
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
- aabbccddee\=find_limits
-Minimum match limit = 13
-Minimum recursion limit = 10
- 0: aabbccddee
- 1: aa
- 2: cc
- 3: ee
-
-/(*LIMIT_MATCH=12bc)abc/
-Failed: error 160 at offset 17: (*VERB) not recognized or malformed
-
-/(*LIMIT_MATCH=4294967290)abc/
-Failed: error 160 at offset 24: (*VERB) not recognized or malformed
-
-/(*LIMIT_RECURSION=4294967280)abc/I
-Capturing subpattern count = 0
-Recursion limit = 4294967280
-First code unit = 'a'
-Last code unit = 'c'
-Subject length lower bound = 3
-
-/(a+)*zz/
- aaaaaaaaaaaaaz
-No match
- aaaaaaaaaaaaaz\=match_limit=3000
-Failed: error -47: match limit exceeded
-
-/(a+)*zz/
- aaaaaaaaaaaaaz\=recursion_limit=10
-Failed: error -53: recursion limit exceeded
-
-/(*LIMIT_MATCH=3000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 3000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaz
-Failed: error -47: match limit exceeded
- aaaaaaaaaaaaaz\=match_limit=60000
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 3000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaz
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=60000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 60000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaz
-No match
- aaaaaaaaaaaaaz\=match_limit=3000
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_RECURSION=10)(a+)*zz/I
-Capturing subpattern count = 1
-Recursion limit = 10
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaz
-Failed: error -53: recursion limit exceeded
- aaaaaaaaaaaaaz\=recursion_limit=1000
-Failed: error -53: recursion limit exceeded
-
-/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
-Capturing subpattern count = 1
-Recursion limit = 1000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaz
-No match
-
-/(*LIMIT_RECURSION=1000)(a+)*zz/I
-Capturing subpattern count = 1
-Recursion limit = 1000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
- aaaaaaaaaaaaaz
-No match
- aaaaaaaaaaaaaz\=recursion_limit=10
-Failed: error -53: recursion limit exceeded
-
-# These three have infinitely nested recursions.
-
-/((?2))((?1))/
- abc
-Failed: error -52: nested recursion at the same subject position
-
-/((?(R2)a+|(?1)b))/
- aaaabcde
-Failed: error -52: nested recursion at the same subject position
-
-/(?(R)a*(?1)|((?R))b)/
- aaaabcde
-Failed: error -52: nested recursion at the same subject position
-
-# The allusedtext modifier does not work with JIT, which does not maintain
-# the leftchar/rightchar data.
-
-/abc(?=xyz)/allusedtext
- abcxyzpqr
- 0: abcxyz
- >>>
- abcxyzpqr\=aftertext
- 0: abcxyz
- >>>
- 0+ xyzpqr
-
-/(?<=pqr)abc(?=xyz)/allusedtext
- xyzpqrabcxyzpqr
- 0: pqrabcxyz
- <<< >>>
- xyzpqrabcxyzpqr\=aftertext
- 0: pqrabcxyz
- <<< >>>
- 0+ xyzpqr
-
-/a\b/
- a.\=allusedtext
- 0: a.
- >
- a\=allusedtext
- 0: a
-
-/abc\Kxyz/
- abcxyz\=allusedtext
- 0: abcxyz
- <<<
-
-/abc(?=xyz(*ACCEPT))/
- abcxyz\=allusedtext
- 0: abcxyz
- >>>
-
-/abc(?=abcde)(?=ab)/allusedtext
- abcabcdefg
- 0: abcabcde
- >>>>>
-
-# These tests provoke recursion loops, which give a different error message
-# when JIT is used.
-
-/(?R)/I
-Capturing subpattern count = 0
-May match empty string
-Subject length lower bound = 0
- abcd
-Failed: error -52: nested recursion at the same subject position
-
-/(a|(?R))/I
-Capturing subpattern count = 1
-May match empty string
-Subject length lower bound = 1
- abcd
- 0: a
- 1: a
- defg
-Failed: error -52: nested recursion at the same subject position
-
-/(ab|(bc|(de|(?R))))/I
-Capturing subpattern count = 3
-May match empty string
-Subject length lower bound = 2
- abcd
- 0: ab
- 1: ab
- fghi
-Failed: error -52: nested recursion at the same subject position
-
-/(ab|(bc|(de|(?1))))/I
-Capturing subpattern count = 3
-May match empty string
-Subject length lower bound = 2
- abcd
- 0: ab
- 1: ab
- fghi
-Failed: error -52: nested recursion at the same subject position
-
-/x(ab|(bc|(de|(?1)x)x)x)/I
-Capturing subpattern count = 3
-First code unit = 'x'
-Subject length lower bound = 3
- xab123
- 0: xab
- 1: ab
- xfghi
-Failed: error -52: nested recursion at the same subject position
-
-/(?!\w)(?R)/
- abcd
-Failed: error -52: nested recursion at the same subject position
- =abc
-Failed: error -52: nested recursion at the same subject position
-
-/(?=\w)(?R)/
- =abc
-Failed: error -52: nested recursion at the same subject position
- abcd
-Failed: error -52: nested recursion at the same subject position
-
-/(?a)++/
+ aa\=find_limits
+Minimum match limit = 5
+Minimum recursion limit = 2
+ 0: aa
+ aaaaaaaaa\=find_limits
+Minimum match limit = 12
+Minimum recursion limit = 2
+ 0: aaaaaaaaa
+
+/(a)(?1)++/
+ aa\=find_limits
+Minimum match limit = 7
+Minimum recursion limit = 4
+ 0: aa
+ 1: a
+ aaaaaaaaa\=find_limits
+Minimum match limit = 21
+Minimum recursion limit = 4
+ 0: aaaaaaaaa
+ 1: a
+
+/a(?:.)*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 65
+Minimum recursion limit = 2
+ 0: abbbbbbbbbbbbbbbbbbbbba
+
+/a(?:.(*THEN))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 86
+Minimum recursion limit = 45
+ 0: abbbbbbbbbbbbbbbbbbbbba
+
+/a(?:.(*THEN:ABC))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 86
+Minimum recursion limit = 45
+ 0: abbbbbbbbbbbbbbbbbbbbba
+
+/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
+ aabbccddee\=find_limits
+Minimum match limit = 7
+Minimum recursion limit = 2
+ 0: aabbccddee
+
+/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
+ aabbccddee\=find_limits
+Minimum match limit = 17
+Minimum recursion limit = 16
+ 0: aabbccddee
+ 1: aa
+ 2: bb
+ 3: cc
+ 4: dd
+ 5: ee
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
+ aabbccddee\=find_limits
+Minimum match limit = 13
+Minimum recursion limit = 10
+ 0: aabbccddee
+ 1: aa
+ 2: cc
+ 3: ee
+
+/(*LIMIT_MATCH=12bc)abc/
+Failed: error 160 at offset 17: (*VERB) not recognized or malformed
+
+/(*LIMIT_MATCH=4294967290)abc/
+Failed: error 160 at offset 24: (*VERB) not recognized or malformed
+
+/(*LIMIT_RECURSION=4294967280)abc/I
Capturing subpattern count = 0
+Recursion limit = 4294967280
First code unit = 'a'
Last code unit = 'c'
Subject length lower bound = 3
-JIT support is not available in this version of PCRE2
-/a*/I
+/(a+)*zz/
+ aaaaaaaaaaaaaz
+No match
+ aaaaaaaaaaaaaz\=match_limit=3000
+Failed: error -47: match limit exceeded
+
+/(a+)*zz/
+ aaaaaaaaaaaaaz\=recursion_limit=10
+Failed: error -53: recursion limit exceeded
+
+/(*LIMIT_MATCH=3000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 3000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+ aaaaaaaaaaaaaz
+Failed: error -47: match limit exceeded
+ aaaaaaaaaaaaaz\=match_limit=60000
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 3000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+ aaaaaaaaaaaaaz
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=60000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 60000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+ aaaaaaaaaaaaaz
+No match
+ aaaaaaaaaaaaaz\=match_limit=3000
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_RECURSION=10)(a+)*zz/I
+Capturing subpattern count = 1
+Recursion limit = 10
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+ aaaaaaaaaaaaaz
+Failed: error -53: recursion limit exceeded
+ aaaaaaaaaaaaaz\=recursion_limit=1000
+Failed: error -53: recursion limit exceeded
+
+/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
+Capturing subpattern count = 1
+Recursion limit = 1000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+ aaaaaaaaaaaaaz
+No match
+
+/(*LIMIT_RECURSION=1000)(a+)*zz/I
+Capturing subpattern count = 1
+Recursion limit = 1000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+ aaaaaaaaaaaaaz
+No match
+ aaaaaaaaaaaaaz\=recursion_limit=10
+Failed: error -53: recursion limit exceeded
+
+# These three have infinitely nested recursions.
+
+/((?2))((?1))/
+ abc
+Failed: error -52: nested recursion at the same subject position
+
+/((?(R2)a+|(?1)b))/
+ aaaabcde
+Failed: error -52: nested recursion at the same subject position
+
+/(?(R)a*(?1)|((?R))b)/
+ aaaabcde
+Failed: error -52: nested recursion at the same subject position
+
+# The allusedtext modifier does not work with JIT, which does not maintain
+# the leftchar/rightchar data.
+
+/abc(?=xyz)/allusedtext
+ abcxyzpqr
+ 0: abcxyz
+ >>>
+ abcxyzpqr\=aftertext
+ 0: abcxyz
+ >>>
+ 0+ xyzpqr
+
+/(?<=pqr)abc(?=xyz)/allusedtext
+ xyzpqrabcxyzpqr
+ 0: pqrabcxyz
+ <<< >>>
+ xyzpqrabcxyzpqr\=aftertext
+ 0: pqrabcxyz
+ <<< >>>
+ 0+ xyzpqr
+
+/a\b/
+ a.\=allusedtext
+ 0: a.
+ >
+ a\=allusedtext
+ 0: a
+
+/abc\Kxyz/
+ abcxyz\=allusedtext
+ 0: abcxyz
+ <<<
+
+/abc(?=xyz(*ACCEPT))/
+ abcxyz\=allusedtext
+ 0: abcxyz
+ >>>
+
+/abc(?=abcde)(?=ab)/allusedtext
+ abcabcdefg
+ 0: abcabcde
+ >>>>>
+
+# These tests provoke recursion loops, which give a different error message
+# when JIT is used.
+
+/(?R)/I
Capturing subpattern count = 0
May match empty string
Subject length lower bound = 0
+ abcd
+Failed: error -52: nested recursion at the same subject position
+
+/(a|(?R))/I
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 1
+ abcd
+ 0: a
+ 1: a
+ defg
+Failed: error -52: nested recursion at the same subject position
+
+/(ab|(bc|(de|(?R))))/I
+Capturing subpattern count = 3
+May match empty string
+Subject length lower bound = 2
+ abcd
+ 0: ab
+ 1: ab
+ fghi
+Failed: error -52: nested recursion at the same subject position
+
+/(ab|(bc|(de|(?1))))/I
+Capturing subpattern count = 3
+May match empty string
+Subject length lower bound = 2
+ abcd
+ 0: ab
+ 1: ab
+ fghi
+Failed: error -52: nested recursion at the same subject position
+
+/x(ab|(bc|(de|(?1)x)x)x)/I
+Capturing subpattern count = 3
+First code unit = 'x'
+Subject length lower bound = 3
+ xab123
+ 0: xab
+ 1: ab
+ xfghi
+Failed: error -52: nested recursion at the same subject position
+
+/(?!\w)(?R)/
+ abcd
+Failed: error -52: nested recursion at the same subject position
+ =abc
+Failed: error -52: nested recursion at the same subject position
+
+/(?=\w)(?R)/
+ =abc
+Failed: error -52: nested recursion at the same subject position
+ abcd
+Failed: error -52: nested recursion at the same subject position
+
+/(?a)++/
- aa\=find_limits
-Minimum match limit = 1
- 0: aa (JIT)
- aaaaaaaaa\=find_limits
-Minimum match limit = 1
- 0: aaaaaaaaa (JIT)
-
-/(a)(?1)++/
- aa\=find_limits
-Minimum match limit = 1
- 0: aa (JIT)
- 1: a
- aaaaaaaaa\=find_limits
-Minimum match limit = 1
- 0: aaaaaaaaa (JIT)
- 1: a
-
-/a(?:.)*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 1
- 0: abbbbbbbbbbbbbbbbbbbbba (JIT)
-
-/a(?:.(*THEN))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 1
- 0: abbbbbbbbbbbbbbbbbbbbba (JIT)
-
-/a(?:.(*THEN:ABC))*?a/ims
- abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 1
- 0: abbbbbbbbbbbbbbbbbbbbba (JIT)
-
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
- aabbccddee\=find_limits
-Minimum match limit = 5
- 0: aabbccddee (JIT)
-
-/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
- aabbccddee\=find_limits
-Minimum match limit = 5
- 0: aabbccddee (JIT)
- 1: aa
- 2: bb
- 3: cc
- 4: dd
- 5: ee
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
- aabbccddee\=find_limits
-Minimum match limit = 5
- 0: aabbccddee (JIT)
- 1: aa
- 2: cc
- 3: ee
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast
- aabbccddee\=find_limits
-Minimum match limit = 5
- 0: aabbccddee (JIT)
- 1: aa
- 2: cc
- 3: ee
- aabbccddee\=jitstack=1
- 0: aabbccddee (JIT)
- 1: aa
- 2: cc
- 3: ee
-
-/(a+)*zz/
- aaaaaaaaaaaaaz
-No match (JIT)
- aaaaaaaaaaaaaz\=match_limit=3000
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=3000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 3000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
-JIT compilation was successful
- aaaaaaaaaaaaaz
-Failed: error -47: match limit exceeded
- aaaaaaaaaaaaaz\=match_limit=60000
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 3000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
-JIT compilation was successful
- aaaaaaaaaaaaaz
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=60000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 60000
-Starting code units: a z
-Last code unit = 'z'
-Subject length lower bound = 2
-JIT compilation was successful
- aaaaaaaaaaaaaz
-No match (JIT)
- aaaaaaaaaaaaaz\=match_limit=3000
-Failed: error -47: match limit exceeded
-
-# These three have infinitely nested recursions.
-
-/((?2))((?1))/
- abc
-Failed: error -46: JIT stack limit reached
-
-/((?(R2)a+|(?1)b))/
- aaaabcde
-Failed: error -46: JIT stack limit reached
-
-/(?(R)a*(?1)|((?R))b)/
- aaaabcde
-Failed: error -46: JIT stack limit reached
-
-# Invalid options disable JIT when called via pcre2_match(), causing the
-# match to happen via the interpreter, but for fast JIT invalid options are
-# ignored, so an unanchored match happens.
-
-/abcd/
- abcd\=anchored
- 0: abcd
- fail abcd\=anchored
-No match
-
-/abcd/jitfast
- abcd\=anchored
- 0: abcd (JIT)
- succeed abcd\=anchored
- 0: abcd (JIT)
-
-# Push/pop does not lose the JIT information, though jitverify applies only to
-# compilation, but serializing (save/load) discards JIT data completely.
-
-/^abc\Kdef/info,push
-** Applies only to compile when pattern is stacked with 'push': jitverify
-Capturing subpattern count = 0
-Compile options:
-Overall options: anchored
-Subject length lower bound = 6
-JIT compilation was successful
-#pop jitverify
- abcdef
- 0: def (JIT)
-
-/^abc\Kdef/info,push
-** Applies only to compile when pattern is stacked with 'push': jitverify
-Capturing subpattern count = 0
-Compile options:
-Overall options: anchored
-Subject length lower bound = 6
-JIT compilation was successful
-#save testsaved1
-#load testsaved1
-#pop jitverify
- abcdef
- 0: def
-
-#load testsaved1
-#pop jit,jitverify
- abcdef
- 0: def (JIT)
-
-# Test pattern compilation
-
-/(?:a|b|c|d|e)(?R)/jit=1
-
-/(?:a|b|c|d|e)(?R)(?R)/jit=1
-
-/(a(?:a|b|c|d|e)b){8,16}/jit=1
-
-/(?:|a|){100}x/jit=1
-
-# These tests provoke recursion loops, which give a different error message
-# when JIT is used.
-
-/(?R)/I
-Capturing subpattern count = 0
-May match empty string
-Subject length lower bound = 0
-JIT compilation was successful
- abcd
-Failed: error -46: JIT stack limit reached
-
-/(a|(?R))/I
-Capturing subpattern count = 1
-May match empty string
-Subject length lower bound = 1
-JIT compilation was successful
- abcd
- 0: a (JIT)
- 1: a
- defg
-Failed: error -46: JIT stack limit reached
-
-/(ab|(bc|(de|(?R))))/I
-Capturing subpattern count = 3
-May match empty string
-Subject length lower bound = 2
-JIT compilation was successful
- abcd
- 0: ab (JIT)
- 1: ab
- fghi
-Failed: error -46: JIT stack limit reached
-
-/(ab|(bc|(de|(?1))))/I
-Capturing subpattern count = 3
-May match empty string
-Subject length lower bound = 2
-JIT compilation was successful
- abcd
- 0: ab (JIT)
- 1: ab
- fghi
-Failed: error -46: JIT stack limit reached
-
-/x(ab|(bc|(de|(?1)x)x)x)/I
-Capturing subpattern count = 3
-First code unit = 'x'
+Last code unit = 'c'
Subject length lower bound = 3
-JIT compilation was successful
- xab123
- 0: xab (JIT)
- 1: ab
- xfghi
-Failed: error -46: JIT stack limit reached
+JIT support is not available in this version of PCRE2
-/(?!\w)(?R)/
- abcd
-Failed: error -46: JIT stack limit reached
- =abc
-Failed: error -46: JIT stack limit reached
-
-/(?=\w)(?R)/
- =abc
-Failed: error -46: JIT stack limit reached
- abcd
-Failed: error -46: JIT stack limit reached
-
-/(?a)++/
+ aa\=find_limits
+Minimum match limit = 1
+ 0: aa (JIT)
+ aaaaaaaaa\=find_limits
+Minimum match limit = 1
+ 0: aaaaaaaaa (JIT)
+
+/(a)(?1)++/
+ aa\=find_limits
+Minimum match limit = 1
+ 0: aa (JIT)
+ 1: a
+ aaaaaaaaa\=find_limits
+Minimum match limit = 1
+ 0: aaaaaaaaa (JIT)
+ 1: a
-/abc/
- abc\=partial_hard
-** Ignored with POSIX interface: partial_hard
- 0: abc
+/a(?:.)*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 1
+ 0: abbbbbbbbbbbbbbbbbbbbba (JIT)
+
+/a(?:.(*THEN))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 1
+ 0: abbbbbbbbbbbbbbbbbbbbba (JIT)
-# Real tests
+/a(?:.(*THEN:ABC))*?a/ims
+ abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 1
+ 0: abbbbbbbbbbbbbbbbbbbbba (JIT)
-/abc/
+/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
+ aabbccddee\=find_limits
+Minimum match limit = 5
+ 0: aabbccddee (JIT)
+
+/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
+ aabbccddee\=find_limits
+Minimum match limit = 5
+ 0: aabbccddee (JIT)
+ 1: aa
+ 2: bb
+ 3: cc
+ 4: dd
+ 5: ee
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
+ aabbccddee\=find_limits
+Minimum match limit = 5
+ 0: aabbccddee (JIT)
+ 1: aa
+ 2: cc
+ 3: ee
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/jitfast
+ aabbccddee\=find_limits
+Minimum match limit = 5
+ 0: aabbccddee (JIT)
+ 1: aa
+ 2: cc
+ 3: ee
+ aabbccddee\=jitstack=1
+ 0: aabbccddee (JIT)
+ 1: aa
+ 2: cc
+ 3: ee
+
+/(a+)*zz/
+ aaaaaaaaaaaaaz
+No match (JIT)
+ aaaaaaaaaaaaaz\=match_limit=3000
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=3000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 3000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+JIT compilation was successful
+ aaaaaaaaaaaaaz
+Failed: error -47: match limit exceeded
+ aaaaaaaaaaaaaz\=match_limit=60000
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 3000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+JIT compilation was successful
+ aaaaaaaaaaaaaz
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=60000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 60000
+Starting code units: a z
+Last code unit = 'z'
+Subject length lower bound = 2
+JIT compilation was successful
+ aaaaaaaaaaaaaz
+No match (JIT)
+ aaaaaaaaaaaaaz\=match_limit=3000
+Failed: error -47: match limit exceeded
+
+# These three have infinitely nested recursions.
+
+/((?2))((?1))/
abc
- 0: abc
- *** Failers
-No match: POSIX code 17: match failed
+Failed: error -46: JIT stack limit reached
-/^abc|def/
+/((?(R2)a+|(?1)b))/
+ aaaabcde
+Failed: error -46: JIT stack limit reached
+
+/(?(R)a*(?1)|((?R))b)/
+ aaaabcde
+Failed: error -46: JIT stack limit reached
+
+# Invalid options disable JIT when called via pcre2_match(), causing the
+# match to happen via the interpreter, but for fast JIT invalid options are
+# ignored, so an unanchored match happens.
+
+/abcd/
+ abcd\=anchored
+ 0: abcd
+ fail abcd\=anchored
+No match
+
+/abcd/jitfast
+ abcd\=anchored
+ 0: abcd (JIT)
+ succeed abcd\=anchored
+ 0: abcd (JIT)
+
+# Push/pop does not lose the JIT information, though jitverify applies only to
+# compilation, but serializing (save/load) discards JIT data completely.
+
+/^abc\Kdef/info,push
+** Applies only to compile when pattern is stacked with 'push': jitverify
+Capturing subpattern count = 0
+Compile options:
+Overall options: anchored
+Subject length lower bound = 6
+JIT compilation was successful
+#pop jitverify
+ abcdef
+ 0: def (JIT)
+
+/^abc\Kdef/info,push
+** Applies only to compile when pattern is stacked with 'push': jitverify
+Capturing subpattern count = 0
+Compile options:
+Overall options: anchored
+Subject length lower bound = 6
+JIT compilation was successful
+#save testsaved1
+#load testsaved1
+#pop jitverify
abcdef
- 0: abc
- abcdef\=notbol
0: def
+
+#load testsaved1
+#pop jit,jitverify
+ abcdef
+ 0: def (JIT)
+
+# Test pattern compilation
-/.*((abc)$|(def))/
- defabc
- 0: defabc
- 1: abc
- 2: abc
- defabc\=noteol
- 0: def
- 1: def
- 3: def
+/(?:a|b|c|d|e)(?R)/jit=1
-/the quick brown fox/
- the quick brown fox
- 0: the quick brown fox
- *** Failers
-No match: POSIX code 17: match failed
- The Quick Brown Fox
-No match: POSIX code 17: match failed
+/(?:a|b|c|d|e)(?R)(?R)/jit=1
-/the quick brown fox/i
- the quick brown fox
- 0: the quick brown fox
- The Quick Brown Fox
- 0: The Quick Brown Fox
+/(a(?:a|b|c|d|e)b){8,16}/jit=1
-/abc.def/
- *** Failers
-No match: POSIX code 17: match failed
- abc\ndef
-No match: POSIX code 17: match failed
+/(?:|a|){100}x/jit=1
-/abc$/
- abc
- 0: abc
- abc\n
- 0: abc
+# These tests provoke recursion loops, which give a different error message
+# when JIT is used.
-/(abc)\2/
-Failed: POSIX code 15: bad back reference at offset 6
+/(?R)/I
+Capturing subpattern count = 0
+May match empty string
+Subject length lower bound = 0
+JIT compilation was successful
+ abcd
+Failed: error -46: JIT stack limit reached
-/(abc\1)/
- abc
-No match: POSIX code 17: match failed
+/(a|(?R))/I
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 1
+JIT compilation was successful
+ abcd
+ 0: a (JIT)
+ 1: a
+ defg
+Failed: error -46: JIT stack limit reached
-/a*(b+)(z)(z)/
- aaaabbbbzzzz
- 0: aaaabbbbzz
- 1: bbbb
- 2: z
- 3: z
- aaaabbbbzzzz\=ovector=0
-Matched without capture
- aaaabbbbzzzz\=ovector=1
- 0: aaaabbbbzz
- aaaabbbbzzzz\=ovector=2
- 0: aaaabbbbzz
- 1: bbbb
+/(ab|(bc|(de|(?R))))/I
+Capturing subpattern count = 3
+May match empty string
+Subject length lower bound = 2
+JIT compilation was successful
+ abcd
+ 0: ab (JIT)
+ 1: ab
+ fghi
+Failed: error -46: JIT stack limit reached
-/ab.cd/
- ab-cd
- 0: ab-cd
- ab=cd
- 0: ab=cd
- ** Failers
-No match: POSIX code 17: match failed
- ab\ncd
-No match: POSIX code 17: match failed
+/(ab|(bc|(de|(?1))))/I
+Capturing subpattern count = 3
+May match empty string
+Subject length lower bound = 2
+JIT compilation was successful
+ abcd
+ 0: ab (JIT)
+ 1: ab
+ fghi
+Failed: error -46: JIT stack limit reached
-/ab.cd/s
- ab-cd
- 0: ab-cd
- ab=cd
- 0: ab=cd
- ab\ncd
- 0: ab\x0acd
+/x(ab|(bc|(de|(?1)x)x)x)/I
+Capturing subpattern count = 3
+First code unit = 'x'
+Subject length lower bound = 3
+JIT compilation was successful
+ xab123
+ 0: xab (JIT)
+ 1: ab
+ xfghi
+Failed: error -46: JIT stack limit reached
-/a(b)c/no_auto_capture
- abc
-Matched with REG_NOSUB
+/(?!\w)(?R)/
+ abcd
+Failed: error -46: JIT stack limit reached
+ =abc
+Failed: error -46: JIT stack limit reached
-/a(?Pb)c/no_auto_capture
- abc
-Matched with REG_NOSUB
+/(?=\w)(?R)/
+ =abc
+Failed: error -46: JIT stack limit reached
+ abcd
+Failed: error -46: JIT stack limit reached
-/a?|b?/
- abc
- 0: a
- ** Failers
- 0:
- ddd\=notempty
-No match: POSIX code 17: match failed
+/(?b)c/no_auto_capture
+ abc
+Matched with REG_NOSUB
+
+/a?|b?/
+ abc
+ 0: a
+ ** Failers
+ 0:
+ ddd\=notempty
+No match: POSIX code 17: match failed
+
+/\w+A/
+ CDAAAAB
+ 0: CDAAAA
+
+/\w+A/ungreedy
+ CDAAAAB
+ 0: CDA
+
+/\Biss\B/I,aftertext
+** Ignored with POSIX interface: info
+ Mississippi
+ 0: iss
+ 0+ issippi
+
+/abc/\
+Failed: POSIX code 9: bad escape sequence at offset 4
+
+"(?(?C)"
+Failed: POSIX code 3: pattern error at offset 2
+
+# End of testdata/testinput18
diff --git a/testdata/testoutput19 b/testdata/testoutput19
index 7f3aa0c..e2bdf32 100644
--- a/testdata/testoutput19
+++ b/testdata/testoutput19
@@ -1,100 +1,20 @@
-# This set of tests exercises the serialization/deserialization functions in
-# the library. It does not use UTF or JIT.
-
-#forbid_utf
-
-# Compile several patterns, push them onto the stack, and then write them
-# all to a file.
-
-#pattern push
-
-/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT))
- (?(DEFINE)
- (?[a-z]+)
- (?\d+)
- )/x
-/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
-
-#save testsaved1
-
-# Do it again for some more patterns.
-
-/(*MARK:A)(*SKIP:B)(C|X)/mark
-** Ignored when compiled pattern is stacked with 'push': mark
-/(?:(?foo)|(?bar))\k/dupnames
-
-#save testsaved2
-#pattern -push
-
-# Reload the patterns, then pop them one by one and check them.
-
-#load testsaved1
-#load testsaved2
-
-#pop info
-Capturing subpattern count = 2
-Max back reference = 2
-Named capturing subpatterns:
- n 1
- n 2
-Options: dupnames
-Starting code units: b f
-Subject length lower bound = 6
- foofoo
- 0: foofoo
- 1: foo
- barbar
- 0: barbar
- 1:
- 2: bar
+# This set of tests is run only with the 8-bit library. It tests the POSIX
+# interface with UTF/UCP support, which is supported only with the 8-bit
+# library. This test should not be run with JIT (which is not available for the
+# POSIX interface).
-#pop mark
- C
- 0: C
- 1: C
-MK: A
- D
-No match, mark = A
+#pattern posix
+
+/a\x{1234}b/utf
+ a\x{1234}b
+ 0: a\x{1234}b
+
+/\w/
+ +++\x{c2}
+No match: POSIX code 17: match failed
+
+/\w/ucp
+ +++\x{c2}
+ 0: \xc2
-#pop
- AmanaplanacanalPanama
- 0: AmanaplanacanalPanama
- 1:
- 2:
- 3: AmanaplanacanalPanama
- 4: A
-
-#pop info
-Capturing subpattern count = 4
-Named capturing subpatterns:
- ADDR 2
- ADDRESS_PAT 4
- NAME 1
- NAME_PAT 3
-Options: extended
-Subject length lower bound = 3
- metcalfe 33
- 0: metcalfe 33
- 1: metcalfe
- 2: 33
-
-# Check for an error when different tables are used.
-
-/abc/push,tables=1
-/xyz/push,tables=2
-#save testsaved1
-Serialization failed: error -30: patterns do not all use the same character tables
-
-#pop
- xyz
- 0: xyz
-
-#pop
- abc
- 0: abc
-
-#pop should give an error
-** Can't pop off an empty stack
- pqr
-
-# End of testinput19
+# End of testdata/testinput19
diff --git a/testdata/testoutput20 b/testdata/testoutput20
new file mode 100644
index 0000000..3dcf51b
--- /dev/null
+++ b/testdata/testoutput20
@@ -0,0 +1,100 @@
+# This set of tests exercises the serialization/deserialization functions in
+# the library. It does not use UTF or JIT.
+
+#forbid_utf
+
+# Compile several patterns, push them onto the stack, and then write them
+# all to a file.
+
+#pattern push
+
+/(?(?&NAME_PAT))\s+(?(?&ADDRESS_PAT))
+ (?(DEFINE)
+ (?[a-z]+)
+ (?\d+)
+ )/x
+/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
+
+#save testsaved1
+
+# Do it again for some more patterns.
+
+/(*MARK:A)(*SKIP:B)(C|X)/mark
+** Ignored when compiled pattern is stacked with 'push': mark
+/(?:(?foo)|(?bar))\k/dupnames
+
+#save testsaved2
+#pattern -push
+
+# Reload the patterns, then pop them one by one and check them.
+
+#load testsaved1
+#load testsaved2
+
+#pop info
+Capturing subpattern count = 2
+Max back reference = 2
+Named capturing subpatterns:
+ n 1
+ n 2
+Options: dupnames
+Starting code units: b f
+Subject length lower bound = 6
+ foofoo
+ 0: foofoo
+ 1: foo
+ barbar
+ 0: barbar
+ 1:
+ 2: bar
+
+#pop mark
+ C
+ 0: C
+ 1: C
+MK: A
+ D
+No match, mark = A
+
+#pop
+ AmanaplanacanalPanama
+ 0: AmanaplanacanalPanama
+ 1:
+ 2:
+ 3: AmanaplanacanalPanama
+ 4: A
+
+#pop info
+Capturing subpattern count = 4
+Named capturing subpatterns:
+ ADDR 2
+ ADDRESS_PAT 4
+ NAME 1
+ NAME_PAT 3
+Options: extended
+Subject length lower bound = 3
+ metcalfe 33
+ 0: metcalfe 33
+ 1: metcalfe
+ 2: 33
+
+# Check for an error when different tables are used.
+
+/abc/push,tables=1
+/xyz/push,tables=2
+#save testsaved1
+Serialization failed: error -30: patterns do not all use the same character tables
+
+#pop
+ xyz
+ 0: xyz
+
+#pop
+ abc
+ 0: abc
+
+#pop should give an error
+** Can't pop off an empty stack
+ pqr
+
+# End of testinput20