Runtime UTF checks now take note of the starting offset.

This commit is contained in:
Philip.Hazel 2015-08-18 10:34:05 +00:00
parent 1370a49dfe
commit ee41aa906f
30 changed files with 2077 additions and 1664 deletions

View File

@ -145,6 +145,10 @@ was fixed.
39. Match limit check added to recursion. This issue was found by Karl Skomski
with a custom LLVM fuzzer.
40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look
only at the part of the subject that is relevant when the starting offset is
non-zero.
Version 10.20 30-June-2015
--------------------------

88
RunTest
View File

@ -68,12 +68,13 @@ title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support"
title11="Test 11: Specials for the basic 16-bit and 32-bit libraries"
title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support"
title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries"
title14="Test 14: Non-JIT limits and other non-JIT tests"
title15="Test 15: JIT-specific features when JIT is not available"
title16="Test 16: JIT-specific features when JIT is available"
title17="Test 17: Tests of the POSIX interface, excluding UTF/UCP"
title18="Test 18: Tests of the POSIX interface with UTF/UCP"
title19="Test 19: Serialization tests"
title14="Test 14: DFA specials for UTF and UCP support"
title15="Test 15: Non-JIT limits and other non-JIT tests"
title16="Test 16: JIT-specific features when JIT is not available"
title17="Test 17: JIT-specific features when JIT is available"
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
title20="Test 20: Serialization tests"
# Highest test number defined by the titleN variables above. It must track the
# last title (title20); it was left at 18 when tests 19 and 20 were added.
maxtest=20
if [ $# -eq 1 -a "$1" = "list" ]; then
@ -97,6 +98,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title17
echo $title18
echo $title19
echo $title20
exit 0
fi
@ -219,6 +221,7 @@ do16=no
do17=no
do18=no
do19=no
do20=no
while [ $# -gt 0 ] ; do
case $1 in
@ -242,10 +245,11 @@ while [ $# -gt 0 ] ; do
17) do17=yes;;
18) do18=yes;;
19) do19=yes;;
20) do20=yes;;
-8) arg8=yes;;
-16) arg16=yes;;
-32) arg32=yes;;
bigstack|-bigstack) bigstack=yes;;
bigstack|-bigstack) bigstack=yes;;
nojit|-nojit) nojit=yes;;
sim|-sim) shift; sim=$1;;
valgrind|-valgrind) valgrind="valgrind --tool=memcheck -q --smc-check=all";;
@ -305,10 +309,10 @@ if [ $? -eq 0 ] ; then
else
test2stack="-S 1024"
defaultstack="-S 64"
fi
fi
else
test2stack=""
defaultstack=""
defaultstack=""
fi
# All of 8-bit, 16-bit, and 32-bit character strings may be supported, but only
@ -387,7 +391,8 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
$do20 = no \
]; then
do0=yes
do1=yes
@ -409,6 +414,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do17=yes
do18=yes
do19=yes
do20=yes
fi
# Handle any explicit skips at this stage, so that an argument list may consist
@ -688,71 +694,79 @@ for bmode in "$test8" "$test16" "$test32"; do
checkresult $? 13 ""
fi
fi
# Test non-JIT match and recursion limits
# Tests for DFA UTF and UCP features. Output is different for the different widths.
if [ $do14 = yes ] ; then
echo $title14
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput14 testtry
checkresult $? 14 ""
$sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
checkresult $? 14-$bits "$opt"
fi
# Test non-JIT match and recursion limits
if [ $do15 = yes ] ; then
echo $title15
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
checkresult $? 15 ""
fi
# Test JIT-specific features when JIT is not available
if [ $do15 = yes ] ; then
echo $title15
if [ $jit -ne 0 ] ; then
echo " Skipped because JIT is available"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
checkresult $? 15 ""
fi
fi
# Test JIT-specific features when JIT is available
if [ $do16 = yes ] ; then
echo $title16
if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
echo " Skipped because JIT is not available or nojit was specified"
if [ $jit -ne 0 ] ; then
echo " Skipped because JIT is available"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput16 testtry
checkresult $? 16 ""
fi
fi
# Tests for the POSIX interface without UTF/UCP (8-bit only)
# Test JIT-specific features when JIT is available
if [ $do17 = yes ] ; then
echo $title17
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
echo " Skipped when running 16/32-bit tests"
if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
echo " Skipped because JIT is not available or nojit was specified"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput17 testtry
checkresult $? 17 ""
fi
fi
# Tests for the POSIX interface with UTF/UCP (8-bit only)
# Tests for the POSIX interface without UTF/UCP (8-bit only)
if [ $do18 = yes ] ; then
echo $title18
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
echo " Skipped when running 16/32-bit tests"
elif [ $utf -eq 0 ] ; then
echo " Skipped because UTF-$bits support is not available"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput18 testtry
checkresult $? 18 ""
fi
fi
# Serialization tests
# Tests for the POSIX interface with UTF/UCP (8-bit only)
if [ $do19 = yes ] ; then
echo $title19
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
checkresult $? 19 ""
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
echo " Skipped when running 16/32-bit tests"
elif [ $utf -eq 0 ] ; then
echo " Skipped because UTF-$bits support is not available"
else
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
checkresult $? 19 ""
fi
fi
# Serialization tests
if [ $do20 = yes ] ; then
echo $title20
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
checkresult $? 20 ""
fi
# End of loop for 8/16/32-bit tests

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "29 July 2015" "PCRE2 10.21"
.TH PCRE2API 3 "18 August 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@ -2022,12 +2022,19 @@ If the pattern is anchored, such a match can occur only if the pattern contains
.sp
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
string is checked by default when \fBpcre2_match()\fP is subsequently called.
The entire string is checked before any other processing takes place, and a
If a non-zero starting offset is given, the check is applied only to that part
of the subject that could be inspected during matching, and there is a check
that the starting offset points to the first code unit of a character or to the
end of the subject. If there are no lookbehind assertions in the pattern, the
check starts at the starting offset. Otherwise, it starts at the length of the
longest lookbehind before the starting offset, or at the start of the subject
if there are not that many characters before the starting offset. Note that the
sequences \eb and \eB are one-character lookbehinds.
.P
The check is carried out before any other processing takes place, and a
negative error code is returned if the check fails. There are several UTF error
codes for each code unit width, corresponding to different problems with the
code unit sequence. The value of \fIstartoffset\fP is also checked, to ensure
that it points to the start of a character or to the end of the subject. There
are discussions about the validity of
code unit sequence. There are discussions about the validity of
.\" HTML <a href="pcre2unicode.html#utf8strings">
.\" </a>
UTF-8 strings,
@ -2939,6 +2946,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 29 July 2015
Last updated: 18 August 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2UNICODE 3 "23 November 2014" "PCRE2 10.00"
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "UNICODE AND UTF SUPPORT"
@ -117,11 +117,21 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code known
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
strings to be in host byte order.
.P
The entire string is checked before any other processing takes place. In
addition to checking the format of the string, there is a check to ensure that
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
The so-called "non-character" code points are not excluded because Unicode
corrigendum #9 makes it clear that they should not be.
A UTF string is checked before any other processing takes place. In the case of
\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
offset, the check is applied only to that part of the subject that could be
inspected during matching, and there is a check that the starting offset points
to the first code unit of a character or to the end of the subject. If there
are no lookbehind assertions in the pattern, the check starts at the starting
offset. Otherwise, it starts at the length of the longest lookbehind before the
starting offset, or at the start of the subject if there are not that many
characters before the starting offset. Note that the sequences \eb and \eB are
one-character lookbehinds.
.P
In addition to checking the format of the string, there is a check to ensure
that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
area. The so-called "non-character" code points are not excluded because
Unicode corrigendum #9 makes it clear that they should not be.
.P
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
where they are used in pairs to encode code points with values greater than
@ -252,6 +262,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 23 November 2014
Copyright (c) 1997-2014 University of Cambridge.
Last updated: 18 August 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -4682,7 +4682,7 @@ for (;; ptr++)
that it's a length rather than a small character. */
#ifdef MAYBE_UTF_MULTI
if (utf && NOT_FIRSTCHAR(code[-1]))
if (utf && NOT_FIRSTCU(code[-1]))
{
PCRE2_UCHAR *lastchar = code - 1;
BACKCHAR(lastchar);

View File

@ -2774,7 +2774,7 @@ for (;;)
{
PCRE2_SPTR p = start_subject + local_offsets[rc];
PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
}
#endif
if (charcount > 0)
@ -2874,7 +2874,7 @@ for (;;)
PCRE2_SPTR pp = local_ptr;
charcount = (int)(pp - p);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
}
@ -2960,7 +2960,7 @@ for (;;)
{
PCRE2_SPTR p = start_subject + local_offsets[0];
PCRE2_SPTR pp = start_subject + local_offsets[1];
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
}
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
@ -3264,18 +3264,50 @@ switch(re->newline_convention)
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
multiunit character. */
multiunit character. We check only the portion of the subject that is going to
be inspected during matching - from the offset minus the maximum lookbehind
to the given length. This saves time when a small part of a large subject is
being matched by the use of a starting offset. Note that the maximum lookbehind
is a number of characters, not code units. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
if (match_data->rc != 0) return match_data->rc;
PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
if (start_offset > 0)
{
#if PCRE2_CODE_UNIT_WIDTH != 32
if (start_offset > 0 && start_offset < length &&
NOT_FIRSTCHAR(subject[start_offset]))
return PCRE2_ERROR_BADUTFOFFSET;
unsigned int i;
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
return PCRE2_ERROR_BADUTFOFFSET;
for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
{
check_subject--;
while (check_subject > subject &&
#if PCRE2_CODE_UNIT_WIDTH == 8
(*check_subject & 0xc0) == 0x80)
#else /* 16-bit */
(*check_subject & 0xfc00) == 0xdc00)
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
check_subject--;
}
#else /* In the 32-bit library, one code unit equals one character. */
check_subject -= re->max_lookbehind;
if (check_subject < subject) check_subject = subject;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
}
/* Validate the relevant portion of the subject. After an error, adjust the
offset to be an absolute offset in the whole string. */
match_data->rc = PRIV(valid_utf)(check_subject,
length - (check_subject - subject), &(match_data->startchar));
if (match_data->rc != 0)
{
match_data->startchar += check_subject - subject;
return match_data->rc;
}
}
#endif /* SUPPORT_UNICODE */

View File

@ -72,7 +72,7 @@ just to undefine them all. */
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
#undef NOT_FIRSTCHAR
#undef NOT_FIRSTCU
#undef PUT
#undef PUT2
#undef PUT2INC
@ -252,7 +252,7 @@ UTF support is omitted, we don't even define them. */
/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCHAR(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@ -285,10 +285,10 @@ Otherwise it has an undefined behaviour. */
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */
/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. */
#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
#define NOT_FIRSTCU(c) (((c) & 0xc0) == 0x80)
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
@ -371,10 +371,10 @@ Otherwise it has an undefined behaviour. */
#define GET_EXTRALEN(c) 1
/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */
/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. */
#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00)
#define NOT_FIRSTCU(c) (((c) & 0xfc00) == 0xdc00)
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */
@ -469,7 +469,7 @@ into one PCRE2_UCHAR unit. */
#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCHAR(c) (0)
#define NOT_FIRSTCU(c) (0)
/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */

View File

@ -6485,6 +6485,7 @@ mb->match_frames_base = &frame_zero;
subject string. */
if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
end_subject = subject + length;
/* Plausibility checks */
@ -6536,18 +6537,50 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
multiunit character. */
multiunit character. We check only the portion of the subject that is going to
be inspected during matching - from the offset minus the maximum lookbehind
to the given length. This saves time when a small part of a large subject is
being matched by the use of a starting offset. Note that the maximum lookbehind
is a number of characters, not code units. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
if (match_data->rc != 0) return match_data->rc;
PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
if (start_offset > 0)
{
#if PCRE2_CODE_UNIT_WIDTH != 32
if (start_offset > 0 && start_offset < length &&
NOT_FIRSTCHAR(subject[start_offset]))
return PCRE2_ERROR_BADUTFOFFSET;
unsigned int i;
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
return PCRE2_ERROR_BADUTFOFFSET;
for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
{
check_subject--;
while (check_subject > subject &&
#if PCRE2_CODE_UNIT_WIDTH == 8
(*check_subject & 0xc0) == 0x80)
#else /* 16-bit */
(*check_subject & 0xfc00) == 0xdc00)
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
check_subject--;
}
#else /* In the 32-bit library, one code unit equals one character. */
check_subject -= re->max_lookbehind;
if (check_subject < subject) check_subject = subject;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
}
/* Validate the relevant portion of the subject. After an error, adjust the
offset to be an absolute offset in the whole string. */
match_data->rc = PRIV(valid_utf)(check_subject,
length - (check_subject - subject), &(match_data->startchar));
if (match_data->rc != 0)
{
match_data->startchar += check_subject - subject;
return match_data->rc;
}
}
#endif /* SUPPORT_UNICODE */
@ -6594,7 +6627,7 @@ else
mb->start_subject = subject;
mb->start_offset = start_offset;
mb->end_subject = end_subject = mb->start_subject + length;
mb->end_subject = end_subject;
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
mb->moptions = options; /* Match options */

29
testdata/testinput10 vendored
View File

@ -132,7 +132,36 @@
\xf9\x87\x80\x80\x80\=no_utf_check
\xfc\x84\x80\x80\x80\x80\=no_utf_check
\xfd\x83\x80\x80\x80\x80\=no_utf_check
# Similar tests with offsets
/badutf/utf
X\xdfabcd
X\xdfabcd\=offset=1
X\xdfabcd\=offset=2
/(?<=x)badutf/utf
X\xdfabcd
X\xdfabcd\=offset=1
X\xdfabcd\=offset=2
X\xdfabcd\=offset=3
X\xdfabcd\xdf\=offset=3
/(?<=xx)badutf/utf
X\xdfabcd
X\xdfabcd\=offset=1
X\xdfabcd\=offset=2
X\xdfabcd\=offset=3
/(?<=xxxx)badutf/utf
X\xdfabcd
X\xdfabcd\=offset=1
X\xdfabcd\=offset=2
X\xdfabcd\=offset=3
X\xdfabcd\=offset=6
X\xdfabc\xdf\=offset=6
X\xdfabc\xdf\=offset=7
/\x{100}/IB,utf
/\x{1000}/IB,utf

View File

@ -158,6 +158,7 @@
/X/utf
XX\x{d800}
XX\x{d800}\=offset=3
XX\x{d800}\=no_utf_check
XX\x{da00}
XX\x{da00}\=no_utf_check
@ -169,6 +170,9 @@
XX\x{dfff}\=no_utf_check
XX\x{110000}
XX\x{d800}\x{1234}
/(?<=.)X/utf
XX\x{d800}\=offset=3
/(*UTF16)\x{11234}/
abcd\x{11234}pqr

182
testdata/testinput14 vendored
View File

@ -1,155 +1,37 @@
# These are:
#
# (1) Tests of the match-limiting features. The results are different for
# interpretive or JIT matching, so this test should not be run with JIT. The
# same tests are run using JIT in test 16.
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
# (2) Other tests that must not be run with JIT.
#subject dfa
/(a+)*zz/I
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
aaaaaaaaaaaaaz\=find_limits
/X/utf
XX\x{d800}
XX\x{d800}\=offset=3
XX\x{d800}\=no_utf_check
XX\x{da00}
XX\x{da00}\=no_utf_check
XX\x{dc00}
XX\x{dc00}\=no_utf_check
XX\x{de00}
XX\x{de00}\=no_utf_check
XX\x{dfff}
XX\x{dfff}\=no_utf_check
XX\x{110000}
XX\x{d800}\x{1234}
/badutf/utf
X\xdf
XX\xef
XXX\xef\x80
X\xf7
XX\xf7\x80
XXX\xf7\x80\x80
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
/* this is a C style comment */\=find_limits
/^(?>a)++/
aa\=find_limits
aaaaaaaaa\=find_limits
/(a)(?1)++/
aa\=find_limits
aaaaaaaaa\=find_limits
/a(?:.)*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
/a(?:.(*THEN))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
/a(?:.(*THEN:ABC))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
aabbccddee\=find_limits
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
aabbccddee\=find_limits
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
aabbccddee\=find_limits
/(*LIMIT_MATCH=12bc)abc/
/(*LIMIT_MATCH=4294967290)abc/
/(*LIMIT_RECURSION=4294967280)abc/I
/(a+)*zz/
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=match_limit=3000
/(a+)*zz/
aaaaaaaaaaaaaz\=recursion_limit=10
/(*LIMIT_MATCH=3000)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=match_limit=60000
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
aaaaaaaaaaaaaz
/(*LIMIT_MATCH=60000)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=match_limit=3000
/(*LIMIT_RECURSION=10)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=recursion_limit=1000
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
aaaaaaaaaaaaaz
/(*LIMIT_RECURSION=1000)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=recursion_limit=10
# These three have infinitely nested recursions.
/((?2))((?1))/
abc
/((?(R2)a+|(?1)b))/
aaaabcde
/(?(R)a*(?1)|((?R))b)/
aaaabcde
# The allusedtext modifier does not work with JIT, which does not maintain
# the leftchar/rightchar data.
/abc(?=xyz)/allusedtext
abcxyzpqr
abcxyzpqr\=aftertext
/(?<=pqr)abc(?=xyz)/allusedtext
xyzpqrabcxyzpqr
xyzpqrabcxyzpqr\=aftertext
/a\b/
a.\=allusedtext
a\=allusedtext
/abc\Kxyz/
abcxyz\=allusedtext
/abc(?=xyz(*ACCEPT))/
abcxyz\=allusedtext
/abc(?=abcde)(?=ab)/allusedtext
abcabcdefg
# These tests provoke recursion loops, which give a different error message
# when JIT is used.
/(?R)/I
abcd
/(a|(?R))/I
abcd
defg
/(ab|(bc|(de|(?R))))/I
abcd
fghi
/(ab|(bc|(de|(?1))))/I
abcd
fghi
/x(ab|(bc|(de|(?1)x)x)x)/I
xab123
xfghi
/(?!\w)(?R)/
abcd
=abc
/(?=\w)(?R)/
=abc
abcd
/(?<!\w)(?R)/
abcd
/(?<=\w)(?R)/
abcd
/(a+|(?R)b)/
aaa
bbb
/[^\xff]((?1))/BI
abcd
/shortutf/utf
XX\xdf\=ph
XX\xef\=ph
XX\xef\x80\=ph
\xf7\=ph
\xf7\x80\=ph
# End of testinput14

156
testdata/testinput15 vendored
View File

@ -1,9 +1,155 @@
# This test is run only when JIT support is not available. It checks that an
# attempt to use it has the expected behaviour. It also tests things that
# are different without JIT.
# These are:
#
# (1) Tests of the match-limiting features. The results are different for
# interpretive or JIT matching, so this test should not be run with JIT. The
# same tests are run using JIT in test 17.
/abc/I,jit,jitverify
# (2) Other tests that must not be run with JIT.
/a*/I
/(a+)*zz/I
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
aaaaaaaaaaaaaz\=find_limits
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
/* this is a C style comment */\=find_limits
/^(?>a)++/
aa\=find_limits
aaaaaaaaa\=find_limits
/(a)(?1)++/
aa\=find_limits
aaaaaaaaa\=find_limits
/a(?:.)*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
/a(?:.(*THEN))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
/a(?:.(*THEN:ABC))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
aabbccddee\=find_limits
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
aabbccddee\=find_limits
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
aabbccddee\=find_limits
/(*LIMIT_MATCH=12bc)abc/
/(*LIMIT_MATCH=4294967290)abc/
/(*LIMIT_RECURSION=4294967280)abc/I
/(a+)*zz/
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=match_limit=3000
/(a+)*zz/
aaaaaaaaaaaaaz\=recursion_limit=10
/(*LIMIT_MATCH=3000)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=match_limit=60000
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
aaaaaaaaaaaaaz
/(*LIMIT_MATCH=60000)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=match_limit=3000
/(*LIMIT_RECURSION=10)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=recursion_limit=1000
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
aaaaaaaaaaaaaz
/(*LIMIT_RECURSION=1000)(a+)*zz/I
aaaaaaaaaaaaaz
aaaaaaaaaaaaaz\=recursion_limit=10
# These three have infinitely nested recursions.
/((?2))((?1))/
abc
/((?(R2)a+|(?1)b))/
aaaabcde
/(?(R)a*(?1)|((?R))b)/
aaaabcde
# The allusedtext modifier does not work with JIT, which does not maintain
# the leftchar/rightchar data.
/abc(?=xyz)/allusedtext
abcxyzpqr
abcxyzpqr\=aftertext
/(?<=pqr)abc(?=xyz)/allusedtext
xyzpqrabcxyzpqr
xyzpqrabcxyzpqr\=aftertext
/a\b/
a.\=allusedtext
a\=allusedtext
/abc\Kxyz/
abcxyz\=allusedtext
/abc(?=xyz(*ACCEPT))/
abcxyz\=allusedtext
/abc(?=abcde)(?=ab)/allusedtext
abcabcdefg
# These tests provoke recursion loops, which give a different error message
# when JIT is used.
/(?R)/I
abcd
/(a|(?R))/I
abcd
defg
/(ab|(bc|(de|(?R))))/I
abcd
fghi
/(ab|(bc|(de|(?1))))/I
abcd
fghi
/x(ab|(bc|(de|(?1)x)x)x)/I
xab123
xfghi
/(?!\w)(?R)/
abcd
=abc
/(?=\w)(?R)/
=abc
abcd
/(?<!\w)(?R)/
abcd
/(?<=\w)(?R)/
abcd
/(a+|(?R)b)/
aaa
bbb
/[^\xff]((?1))/BI
abcd
# End of testinput15

251
testdata/testinput16 vendored

File diff suppressed because one or more lines are too long

289
testdata/testinput17 vendored

File diff suppressed because one or more lines are too long

100
testdata/testinput18 vendored
View File

@ -1,17 +1,95 @@
# This set of tests is run only with the 8-bit library. It tests the POSIX
# interface with UTF/UCP support, which is supported only with the 8-bit
# library. This test should not be run with JIT (which is not available for the
# POSIX interface).
# interface, which is supported only with the 8-bit library. This test should
# not be run with JIT (which is not available for the POSIX interface).
#forbid_utf
#pattern posix
/a\x{1234}b/utf
a\x{1234}b
# Test invalid options
/\w/
+++\x{c2}
/abc/auto_callout
/\w/ucp
+++\x{c2}
# End of testdata/testinput17
/abc/
abc\=find_limits
/abc/
abc\=partial_hard
# Real tests
/abc/
abc
*** Failers
/^abc|def/
abcdef
abcdef\=notbol
/.*((abc)$|(def))/
defabc
defabc\=noteol
/the quick brown fox/
the quick brown fox
*** Failers
The Quick Brown Fox
/the quick brown fox/i
the quick brown fox
The Quick Brown Fox
/abc.def/
*** Failers
abc\ndef
/abc$/
abc
abc\n
/(abc)\2/
/(abc\1)/
abc
/a*(b+)(z)(z)/
aaaabbbbzzzz
aaaabbbbzzzz\=ovector=0
aaaabbbbzzzz\=ovector=1
aaaabbbbzzzz\=ovector=2
/ab.cd/
ab-cd
ab=cd
** Failers
ab\ncd
/ab.cd/s
ab-cd
ab=cd
ab\ncd
/a(b)c/no_auto_capture
abc
/a(?P<name>b)c/no_auto_capture
abc
/a?|b?/
abc
** Failers
ddd\=notempty
/\w+A/
CDAAAAB
/\w+A/ungreedy
CDAAAAB
/\Biss\B/I,aftertext
Mississippi
/abc/\
"(?(?C)"
# End of testdata/testinput18

75
testdata/testinput19 vendored
View File

@ -1,62 +1,17 @@
# This set of tests exercises the serialization/deserialization functions in
# the library. It does not use UTF or JIT.
#forbid_utf
# Compile several patterns, push them onto the stack, and then write them
# all to a file.
#pattern push
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
(?(DEFINE)
(?<NAME_PAT>[a-z]+)
(?<ADDRESS_PAT>\d+)
)/x
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
#save testsaved1
# Do it again for some more patterns.
/(*MARK:A)(*SKIP:B)(C|X)/mark
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
#save testsaved2
#pattern -push
# Reload the patterns, then pop them one by one and check them.
#load testsaved1
#load testsaved2
#pop info
foofoo
barbar
# This set of tests is run only with the 8-bit library. It tests the POSIX
# interface with UTF/UCP support, which is supported only with the 8-bit
# library. This test should not be run with JIT (which is not available for the
# POSIX interface).
#pop mark
C
D
#pattern posix
/a\x{1234}b/utf
a\x{1234}b
/\w/
+++\x{c2}
/\w/ucp
+++\x{c2}
#pop
AmanaplanacanalPanama
#pop info
metcalfe 33
# Check for an error when different tables are used.
/abc/push,tables=1
/xyz/push,tables=2
#save testsaved1
#pop
xyz
#pop
abc
#pop should give an error
pqr
# End of testinput19
# End of testdata/testinput19

62
testdata/testinput20 vendored Normal file
View File

@ -0,0 +1,62 @@
# This set of tests exercises the serialization/deserialization functions in
# the library. It does not use UTF or JIT.
#forbid_utf
# Compile several patterns, push them onto the stack, and then write them
# all to a file.
#pattern push
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
(?(DEFINE)
(?<NAME_PAT>[a-z]+)
(?<ADDRESS_PAT>\d+)
)/x
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
#save testsaved1
# Do it again for some more patterns.
/(*MARK:A)(*SKIP:B)(C|X)/mark
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
#save testsaved2
#pattern -push
# Reload the patterns, then pop them one by one and check them.
#load testsaved1
#load testsaved2
#pop info
foofoo
barbar
#pop mark
C
D
#pop
AmanaplanacanalPanama
#pop info
metcalfe 33
# Check for an error when different tables are used.
/abc/push,tables=1
/xyz/push,tables=2
#save testsaved1
#pop
xyz
#pop
abc
#pop should give an error
pqr
# End of testinput20

48
testdata/testoutput10 vendored
View File

@ -235,7 +235,55 @@ No match
No match
\xfd\x83\x80\x80\x80\x80\=no_utf_check
No match
# Similar tests with offsets
/badutf/utf
X\xdfabcd
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=1
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=2
No match
/(?<=x)badutf/utf
X\xdfabcd
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=1
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=2
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=3
No match
X\xdfabcd\xdf\=offset=3
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6
/(?<=xx)badutf/utf
X\xdfabcd
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=1
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=2
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=3
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
/(?<=xxxx)badutf/utf
X\xdfabcd
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=1
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=2
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=3
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
X\xdfabcd\=offset=6
No match
X\xdfabc\xdf\=offset=6
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5
X\xdfabc\xdf\=offset=7
Failed: error -33: bad offset value
/\x{100}/IB,utf
------------------------------------------------------------------
Bra

View File

@ -609,6 +609,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
/X/utf
XX\x{d800}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
XX\x{d800}\=offset=3
No match
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
@ -631,6 +633,10 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
XX\x{d800}\x{1234}
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
/(?<=.)X/utf
XX\x{d800}\=offset=3
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
/(*UTF16)\x{11234}/
abcd\x{11234}pqr

View File

@ -602,6 +602,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
/X/utf
XX\x{d800}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{d800}\=offset=3
No match
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
@ -624,6 +626,10 @@ Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at of
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2
XX\x{d800}\x{1234}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
/(?<=.)X/utf
XX\x{d800}\=offset=3
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
/(*UTF16)\x{11234}/
Failed: error 160 at offset 5: (*VERB) not recognized or malformed

334
testdata/testoutput14 vendored
View File

@ -1,334 +0,0 @@
# These are:
#
# (1) Tests of the match-limiting features. The results are different for
# interpretive or JIT matching, so this test should not be run with JIT. The
# same tests are run using JIT in test 16.
# (2) Other tests that must not be run with JIT.
/(a+)*zz/I
Capturing subpattern count = 1
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
Minimum match limit = 8
Minimum recursion limit = 6
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\=find_limits
Minimum match limit = 32768
Minimum recursion limit = 29
No match
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
/* this is a C style comment */\=find_limits
Minimum match limit = 120
Minimum recursion limit = 6
0: /* this is a C style comment */
1: /* this is a C style comment */
/^(?>a)++/
aa\=find_limits
Minimum match limit = 5
Minimum recursion limit = 2
0: aa
aaaaaaaaa\=find_limits
Minimum match limit = 12
Minimum recursion limit = 2
0: aaaaaaaaa
/(a)(?1)++/
aa\=find_limits
Minimum match limit = 7
Minimum recursion limit = 4
0: aa
1: a
aaaaaaaaa\=find_limits
Minimum match limit = 21
Minimum recursion limit = 4
0: aaaaaaaaa
1: a
/a(?:.)*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum match limit = 65
Minimum recursion limit = 2
0: abbbbbbbbbbbbbbbbbbbbba
/a(?:.(*THEN))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum match limit = 86
Minimum recursion limit = 45
0: abbbbbbbbbbbbbbbbbbbbba
/a(?:.(*THEN:ABC))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum match limit = 86
Minimum recursion limit = 45
0: abbbbbbbbbbbbbbbbbbbbba
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
aabbccddee\=find_limits
Minimum match limit = 7
Minimum recursion limit = 2
0: aabbccddee
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
aabbccddee\=find_limits
Minimum match limit = 17
Minimum recursion limit = 16
0: aabbccddee
1: aa
2: bb
3: cc
4: dd
5: ee
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
aabbccddee\=find_limits
Minimum match limit = 13
Minimum recursion limit = 10
0: aabbccddee
1: aa
2: cc
3: ee
/(*LIMIT_MATCH=12bc)abc/
Failed: error 160 at offset 17: (*VERB) not recognized or malformed
/(*LIMIT_MATCH=4294967290)abc/
Failed: error 160 at offset 24: (*VERB) not recognized or malformed
/(*LIMIT_RECURSION=4294967280)abc/I
Capturing subpattern count = 0
Recursion limit = 4294967280
First code unit = 'a'
Last code unit = 'c'
Subject length lower bound = 3
/(a+)*zz/
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=match_limit=3000
Failed: error -47: match limit exceeded
/(a+)*zz/
aaaaaaaaaaaaaz\=recursion_limit=10
Failed: error -53: recursion limit exceeded
/(*LIMIT_MATCH=3000)(a+)*zz/I
Capturing subpattern count = 1
Match limit = 3000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -47: match limit exceeded
aaaaaaaaaaaaaz\=match_limit=60000
Failed: error -47: match limit exceeded
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
Capturing subpattern count = 1
Match limit = 3000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -47: match limit exceeded
/(*LIMIT_MATCH=60000)(a+)*zz/I
Capturing subpattern count = 1
Match limit = 60000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=match_limit=3000
Failed: error -47: match limit exceeded
/(*LIMIT_RECURSION=10)(a+)*zz/I
Capturing subpattern count = 1
Recursion limit = 10
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -53: recursion limit exceeded
aaaaaaaaaaaaaz\=recursion_limit=1000
Failed: error -53: recursion limit exceeded
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
Capturing subpattern count = 1
Recursion limit = 1000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
/(*LIMIT_RECURSION=1000)(a+)*zz/I
Capturing subpattern count = 1
Recursion limit = 1000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=recursion_limit=10
Failed: error -53: recursion limit exceeded
# These three have infinitely nested recursions.
/((?2))((?1))/
abc
Failed: error -52: nested recursion at the same subject position
/((?(R2)a+|(?1)b))/
aaaabcde
Failed: error -52: nested recursion at the same subject position
/(?(R)a*(?1)|((?R))b)/
aaaabcde
Failed: error -52: nested recursion at the same subject position
# The allusedtext modifier does not work with JIT, which does not maintain
# the leftchar/rightchar data.
/abc(?=xyz)/allusedtext
abcxyzpqr
0: abcxyz
>>>
abcxyzpqr\=aftertext
0: abcxyz
>>>
0+ xyzpqr
/(?<=pqr)abc(?=xyz)/allusedtext
xyzpqrabcxyzpqr
0: pqrabcxyz
<<< >>>
xyzpqrabcxyzpqr\=aftertext
0: pqrabcxyz
<<< >>>
0+ xyzpqr
/a\b/
a.\=allusedtext
0: a.
>
a\=allusedtext
0: a
/abc\Kxyz/
abcxyz\=allusedtext
0: abcxyz
<<<
/abc(?=xyz(*ACCEPT))/
abcxyz\=allusedtext
0: abcxyz
>>>
/abc(?=abcde)(?=ab)/allusedtext
abcabcdefg
0: abcabcde
>>>>>
# These tests provoke recursion loops, which give a different error message
# when JIT is used.
/(?R)/I
Capturing subpattern count = 0
May match empty string
Subject length lower bound = 0
abcd
Failed: error -52: nested recursion at the same subject position
/(a|(?R))/I
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 1
abcd
0: a
1: a
defg
Failed: error -52: nested recursion at the same subject position
/(ab|(bc|(de|(?R))))/I
Capturing subpattern count = 3
May match empty string
Subject length lower bound = 2
abcd
0: ab
1: ab
fghi
Failed: error -52: nested recursion at the same subject position
/(ab|(bc|(de|(?1))))/I
Capturing subpattern count = 3
May match empty string
Subject length lower bound = 2
abcd
0: ab
1: ab
fghi
Failed: error -52: nested recursion at the same subject position
/x(ab|(bc|(de|(?1)x)x)x)/I
Capturing subpattern count = 3
First code unit = 'x'
Subject length lower bound = 3
xab123
0: xab
1: ab
xfghi
Failed: error -52: nested recursion at the same subject position
/(?!\w)(?R)/
abcd
Failed: error -52: nested recursion at the same subject position
=abc
Failed: error -52: nested recursion at the same subject position
/(?=\w)(?R)/
=abc
Failed: error -52: nested recursion at the same subject position
abcd
Failed: error -52: nested recursion at the same subject position
/(?<!\w)(?R)/
abcd
Failed: error -52: nested recursion at the same subject position
/(?<=\w)(?R)/
abcd
Failed: error -52: nested recursion at the same subject position
/(a+|(?R)b)/
aaa
0: aaa
1: aaa
bbb
Failed: error -52: nested recursion at the same subject position
/[^\xff]((?1))/BI
------------------------------------------------------------------
Bra
[^\x{ff}]
CBra 1
Recurse
Ket
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 1
Subject length lower bound = 1
abcd
Failed: error -52: nested recursion at the same subject position
# End of testinput14

61
testdata/testoutput14-16 vendored Normal file
View File

@ -0,0 +1,61 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
#subject dfa
/X/utf
XX\x{d800}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
XX\x{d800}\=offset=3
No match
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
XX\x{da00}\=no_utf_check
0: X
XX\x{dc00}
Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
XX\x{dc00}\=no_utf_check
0: X
XX\x{de00}
Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
XX\x{de00}\=no_utf_check
0: X
XX\x{dfff}
Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
XX\x{dfff}\=no_utf_check
0: X
XX\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
XX\x{d800}\x{1234}
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
/badutf/utf
X\xdf
No match
XX\xef
No match
XXX\xef\x80
No match
X\xf7
No match
XX\xf7\x80
No match
XXX\xf7\x80\x80
No match
/shortutf/utf
XX\xdf\=ph
No match
XX\xef\=ph
No match
XX\xef\x80\=ph
No match
\xf7\=ph
No match
\xf7\x80\=ph
No match
# End of testinput14

61
testdata/testoutput14-32 vendored Normal file
View File

@ -0,0 +1,61 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
#subject dfa
/X/utf
XX\x{d800}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{d800}\=offset=3
No match
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{da00}\=no_utf_check
0: X
XX\x{dc00}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{dc00}\=no_utf_check
0: X
XX\x{de00}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{de00}\=no_utf_check
0: X
XX\x{dfff}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{dfff}\=no_utf_check
0: X
XX\x{110000}
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2
XX\x{d800}\x{1234}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
/badutf/utf
X\xdf
No match
XX\xef
No match
XXX\xef\x80
No match
X\xf7
No match
XX\xf7\x80
No match
XXX\xf7\x80\x80
No match
/shortutf/utf
XX\xdf\=ph
No match
XX\xef\=ph
No match
XX\xef\x80\=ph
No match
\xf7\=ph
No match
\xf7\x80\=ph
No match
# End of testinput14

61
testdata/testoutput14-8 vendored Normal file
View File

@ -0,0 +1,61 @@
# These test special (mostly error) UTF features of DFA matching. They are a
# selection of the more comprehensive tests that are run for non-DFA matching.
# The output is different for the different widths.
#subject dfa
/X/utf
XX\x{d800}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{d800}\=offset=3
Error -36 (bad UTF-8 offset)
XX\x{d800}\=no_utf_check
0: X
XX\x{da00}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{da00}\=no_utf_check
0: X
XX\x{dc00}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{dc00}\=no_utf_check
0: X
XX\x{de00}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{de00}\=no_utf_check
0: X
XX\x{dfff}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
XX\x{dfff}\=no_utf_check
0: X
XX\x{110000}
Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 2
XX\x{d800}\x{1234}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
/badutf/utf
X\xdf
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 1
XX\xef
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
XXX\xef\x80
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
X\xf7
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 1
XX\xf7\x80
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
XXX\xf7\x80\x80
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
/shortutf/utf
XX\xdf\=ph
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
XX\xef\=ph
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
XX\xef\x80\=ph
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
\xf7\=ph
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
\xf7\x80\=ph
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
# End of testinput14

329
testdata/testoutput15 vendored
View File

@ -1,17 +1,334 @@
# This test is run only when JIT support is not available. It checks that an
# attempt to use it has the expected behaviour. It also tests things that
# are different without JIT.
# These are:
#
# (1) Tests of the match-limiting features. The results are different for
# interpretive or JIT matching, so this test should not be run with JIT. The
# same tests are run using JIT in test 17.
/abc/I,jit,jitverify
# (2) Other tests that must not be run with JIT.
/(a+)*zz/I
Capturing subpattern count = 1
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
Minimum match limit = 8
Minimum recursion limit = 6
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\=find_limits
Minimum match limit = 32768
Minimum recursion limit = 29
No match
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
/* this is a C style comment */\=find_limits
Minimum match limit = 120
Minimum recursion limit = 6
0: /* this is a C style comment */
1: /* this is a C style comment */
/^(?>a)++/
aa\=find_limits
Minimum match limit = 5
Minimum recursion limit = 2
0: aa
aaaaaaaaa\=find_limits
Minimum match limit = 12
Minimum recursion limit = 2
0: aaaaaaaaa
/(a)(?1)++/
aa\=find_limits
Minimum match limit = 7
Minimum recursion limit = 4
0: aa
1: a
aaaaaaaaa\=find_limits
Minimum match limit = 21
Minimum recursion limit = 4
0: aaaaaaaaa
1: a
/a(?:.)*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum match limit = 65
Minimum recursion limit = 2
0: abbbbbbbbbbbbbbbbbbbbba
/a(?:.(*THEN))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum match limit = 86
Minimum recursion limit = 45
0: abbbbbbbbbbbbbbbbbbbbba
/a(?:.(*THEN:ABC))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum match limit = 86
Minimum recursion limit = 45
0: abbbbbbbbbbbbbbbbbbbbba
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
aabbccddee\=find_limits
Minimum match limit = 7
Minimum recursion limit = 2
0: aabbccddee
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
aabbccddee\=find_limits
Minimum match limit = 17
Minimum recursion limit = 16
0: aabbccddee
1: aa
2: bb
3: cc
4: dd
5: ee
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
aabbccddee\=find_limits
Minimum match limit = 13
Minimum recursion limit = 10
0: aabbccddee
1: aa
2: cc
3: ee
/(*LIMIT_MATCH=12bc)abc/
Failed: error 160 at offset 17: (*VERB) not recognized or malformed
/(*LIMIT_MATCH=4294967290)abc/
Failed: error 160 at offset 24: (*VERB) not recognized or malformed
/(*LIMIT_RECURSION=4294967280)abc/I
Capturing subpattern count = 0
Recursion limit = 4294967280
First code unit = 'a'
Last code unit = 'c'
Subject length lower bound = 3
JIT support is not available in this version of PCRE2
/a*/I
/(a+)*zz/
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=match_limit=3000
Failed: error -47: match limit exceeded
/(a+)*zz/
aaaaaaaaaaaaaz\=recursion_limit=10
Failed: error -53: recursion limit exceeded
/(*LIMIT_MATCH=3000)(a+)*zz/I
Capturing subpattern count = 1
Match limit = 3000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -47: match limit exceeded
aaaaaaaaaaaaaz\=match_limit=60000
Failed: error -47: match limit exceeded
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
Capturing subpattern count = 1
Match limit = 3000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -47: match limit exceeded
/(*LIMIT_MATCH=60000)(a+)*zz/I
Capturing subpattern count = 1
Match limit = 60000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=match_limit=3000
Failed: error -47: match limit exceeded
/(*LIMIT_RECURSION=10)(a+)*zz/I
Capturing subpattern count = 1
Recursion limit = 10
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -53: recursion limit exceeded
aaaaaaaaaaaaaz\=recursion_limit=1000
Failed: error -53: recursion limit exceeded
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
Capturing subpattern count = 1
Recursion limit = 1000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
/(*LIMIT_RECURSION=1000)(a+)*zz/I
Capturing subpattern count = 1
Recursion limit = 1000
Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=recursion_limit=10
Failed: error -53: recursion limit exceeded
# These three have infinitely nested recursions.
/((?2))((?1))/
abc
Failed: error -52: nested recursion at the same subject position
/((?(R2)a+|(?1)b))/
aaaabcde
Failed: error -52: nested recursion at the same subject position
/(?(R)a*(?1)|((?R))b)/
aaaabcde
Failed: error -52: nested recursion at the same subject position
# The allusedtext modifier does not work with JIT, which does not maintain
# the leftchar/rightchar data.
/abc(?=xyz)/allusedtext
abcxyzpqr
0: abcxyz
>>>
abcxyzpqr\=aftertext
0: abcxyz
>>>
0+ xyzpqr
/(?<=pqr)abc(?=xyz)/allusedtext
xyzpqrabcxyzpqr
0: pqrabcxyz
<<< >>>
xyzpqrabcxyzpqr\=aftertext
0: pqrabcxyz
<<< >>>
0+ xyzpqr
/a\b/
a.\=allusedtext
0: a.
>
a\=allusedtext
0: a
/abc\Kxyz/
abcxyz\=allusedtext
0: abcxyz
<<<
/abc(?=xyz(*ACCEPT))/
abcxyz\=allusedtext
0: abcxyz
>>>
/abc(?=abcde)(?=ab)/allusedtext
abcabcdefg
0: abcabcde
>>>>>
# These tests provoke recursion loops, which give a different error message
# when JIT is used.
/(?R)/I
Capturing subpattern count = 0
May match empty string
Subject length lower bound = 0
abcd
Failed: error -52: nested recursion at the same subject position
/(a|(?R))/I
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 1
abcd
0: a
1: a
defg
Failed: error -52: nested recursion at the same subject position
/(ab|(bc|(de|(?R))))/I
Capturing subpattern count = 3
May match empty string
Subject length lower bound = 2
abcd
0: ab
1: ab
fghi
Failed: error -52: nested recursion at the same subject position
/(ab|(bc|(de|(?1))))/I
Capturing subpattern count = 3
May match empty string
Subject length lower bound = 2
abcd
0: ab
1: ab
fghi
Failed: error -52: nested recursion at the same subject position
/x(ab|(bc|(de|(?1)x)x)x)/I
Capturing subpattern count = 3
First code unit = 'x'
Subject length lower bound = 3
xab123
0: xab
1: ab
xfghi
Failed: error -52: nested recursion at the same subject position
/(?!\w)(?R)/
abcd
Failed: error -52: nested recursion at the same subject position
=abc
Failed: error -52: nested recursion at the same subject position
/(?=\w)(?R)/
=abc
Failed: error -52: nested recursion at the same subject position
abcd
Failed: error -52: nested recursion at the same subject position
/(?<!\w)(?R)/
abcd
Failed: error -52: nested recursion at the same subject position
/(?<=\w)(?R)/
abcd
Failed: error -52: nested recursion at the same subject position
/(a+|(?R)b)/
aaa
0: aaa
1: aaa
bbb
Failed: error -52: nested recursion at the same subject position
/[^\xff]((?1))/BI
------------------------------------------------------------------
Bra
[^\x{ff}]
CBra 1
Recurse
Ket
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 1
Subject length lower bound = 1
abcd
Failed: error -52: nested recursion at the same subject position
# End of testinput15

487
testdata/testoutput16 vendored

File diff suppressed because one or more lines are too long

576
testdata/testoutput17 vendored

File diff suppressed because one or more lines are too long

154
testdata/testoutput18 vendored
View File

@ -1,20 +1,148 @@
# This set of tests is run only with the 8-bit library. It tests the POSIX
# interface with UTF/UCP support, which is supported only with the 8-bit
# library. This test should not be run with JIT (which is not available for the
# POSIX interface).
# interface, which is supported only with the 8-bit library. This test should
# not be run with JIT (which is not available for the POSIX interface).
#forbid_utf
#pattern posix
/a\x{1234}b/utf
a\x{1234}b
0: a\x{1234}b
# Test invalid options
/\w/
+++\x{c2}
/abc/auto_callout
** Ignored with POSIX interface: auto_callout
/abc/
abc\=find_limits
** Ignored with POSIX interface: find_limits
0: abc
/abc/
abc\=partial_hard
** Ignored with POSIX interface: partial_hard
0: abc
# Real tests
/abc/
abc
0: abc
*** Failers
No match: POSIX code 17: match failed
/\w/ucp
+++\x{c2}
0: \xc2
# End of testdata/testinput17
/^abc|def/
abcdef
0: abc
abcdef\=notbol
0: def
/.*((abc)$|(def))/
defabc
0: defabc
1: abc
2: abc
defabc\=noteol
0: def
1: def
3: def
/the quick brown fox/
the quick brown fox
0: the quick brown fox
*** Failers
No match: POSIX code 17: match failed
The Quick Brown Fox
No match: POSIX code 17: match failed
/the quick brown fox/i
the quick brown fox
0: the quick brown fox
The Quick Brown Fox
0: The Quick Brown Fox
/abc.def/
*** Failers
No match: POSIX code 17: match failed
abc\ndef
No match: POSIX code 17: match failed
/abc$/
abc
0: abc
abc\n
0: abc
/(abc)\2/
Failed: POSIX code 15: bad back reference at offset 6
/(abc\1)/
abc
No match: POSIX code 17: match failed
/a*(b+)(z)(z)/
aaaabbbbzzzz
0: aaaabbbbzz
1: bbbb
2: z
3: z
aaaabbbbzzzz\=ovector=0
Matched without capture
aaaabbbbzzzz\=ovector=1
0: aaaabbbbzz
aaaabbbbzzzz\=ovector=2
0: aaaabbbbzz
1: bbbb
/ab.cd/
ab-cd
0: ab-cd
ab=cd
0: ab=cd
** Failers
No match: POSIX code 17: match failed
ab\ncd
No match: POSIX code 17: match failed
/ab.cd/s
ab-cd
0: ab-cd
ab=cd
0: ab=cd
ab\ncd
0: ab\x0acd
/a(b)c/no_auto_capture
abc
Matched with REG_NOSUB
/a(?P<name>b)c/no_auto_capture
abc
Matched with REG_NOSUB
/a?|b?/
abc
0: a
** Failers
0:
ddd\=notempty
No match: POSIX code 17: match failed
/\w+A/
CDAAAAB
0: CDAAAA
/\w+A/ungreedy
CDAAAAB
0: CDA
/\Biss\B/I,aftertext
** Ignored with POSIX interface: info
Mississippi
0: iss
0+ issippi
/abc/\
Failed: POSIX code 9: bad escape sequence at offset 4
"(?(?C)"
Failed: POSIX code 3: pattern error at offset 2
# End of testdata/testinput18

116
testdata/testoutput19 vendored
View File

@ -1,100 +1,20 @@
# This set of tests exercises the serialization/deserialization functions in
# the library. It does not use UTF or JIT.
#forbid_utf
# Compile several patterns, push them onto the stack, and then write them
# all to a file.
#pattern push
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
(?(DEFINE)
(?<NAME_PAT>[a-z]+)
(?<ADDRESS_PAT>\d+)
)/x
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
#save testsaved1
# Do it again for some more patterns.
/(*MARK:A)(*SKIP:B)(C|X)/mark
** Ignored when compiled pattern is stacked with 'push': mark
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
#save testsaved2
#pattern -push
# Reload the patterns, then pop them one by one and check them.
#load testsaved1
#load testsaved2
#pop info
Capturing subpattern count = 2
Max back reference = 2
Named capturing subpatterns:
n 1
n 2
Options: dupnames
Starting code units: b f
Subject length lower bound = 6
foofoo
0: foofoo
1: foo
barbar
0: barbar
1: <unset>
2: bar
# This set of tests is run only with the 8-bit library. It tests the POSIX
# interface with UTF/UCP support, which is supported only with the 8-bit
# library. This test should not be run with JIT (which is not available for the
# POSIX interface).
#pop mark
C
0: C
1: C
MK: A
D
No match, mark = A
#pattern posix
/a\x{1234}b/utf
a\x{1234}b
0: a\x{1234}b
/\w/
+++\x{c2}
No match: POSIX code 17: match failed
/\w/ucp
+++\x{c2}
0: \xc2
#pop
AmanaplanacanalPanama
0: AmanaplanacanalPanama
1: <unset>
2: <unset>
3: AmanaplanacanalPanama
4: A
#pop info
Capturing subpattern count = 4
Named capturing subpatterns:
ADDR 2
ADDRESS_PAT 4
NAME 1
NAME_PAT 3
Options: extended
Subject length lower bound = 3
metcalfe 33
0: metcalfe 33
1: metcalfe
2: 33
# Check for an error when different tables are used.
/abc/push,tables=1
/xyz/push,tables=2
#save testsaved1
Serialization failed: error -30: patterns do not all use the same character tables
#pop
xyz
0: xyz
#pop
abc
0: abc
#pop should give an error
** Can't pop off an empty stack
pqr
# End of testinput19
# End of testdata/testinput19

100
testdata/testoutput20 vendored Normal file
View File

@ -0,0 +1,100 @@
# This set of tests exercises the serialization/deserialization functions in
# the library. It does not use UTF or JIT.
#forbid_utf
# Compile several patterns, push them onto the stack, and then write them
# all to a file.
#pattern push
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
(?(DEFINE)
(?<NAME_PAT>[a-z]+)
(?<ADDRESS_PAT>\d+)
)/x
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
#save testsaved1
# Do it again for some more patterns.
/(*MARK:A)(*SKIP:B)(C|X)/mark
** Ignored when compiled pattern is stacked with 'push': mark
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
#save testsaved2
#pattern -push
# Reload the patterns, then pop them one by one and check them.
#load testsaved1
#load testsaved2
#pop info
Capturing subpattern count = 2
Max back reference = 2
Named capturing subpatterns:
n 1
n 2
Options: dupnames
Starting code units: b f
Subject length lower bound = 6
foofoo
0: foofoo
1: foo
barbar
0: barbar
1: <unset>
2: bar
#pop mark
C
0: C
1: C
MK: A
D
No match, mark = A
#pop
AmanaplanacanalPanama
0: AmanaplanacanalPanama
1: <unset>
2: <unset>
3: AmanaplanacanalPanama
4: A
#pop info
Capturing subpattern count = 4
Named capturing subpatterns:
ADDR 2
ADDRESS_PAT 4
NAME 1
NAME_PAT 3
Options: extended
Subject length lower bound = 3
metcalfe 33
0: metcalfe 33
1: metcalfe
2: 33
# Check for an error when different tables are used.
/abc/push,tables=1
/xyz/push,tables=2
#save testsaved1
Serialization failed: error -30: patterns do not all use the same character tables
#pop
xyz
0: xyz
#pop
abc
0: abc
#pop should give an error
** Can't pop off an empty stack
pqr
# End of testinput20