Runtime UTF checks now take note of the starting offset.
This commit is contained in:
parent
1370a49dfe
commit
ee41aa906f
|
@ -145,6 +145,10 @@ was fixed.
|
|||
39. Match limit check added to recursion. This issue was found by Karl Skomski
|
||||
with a custom LLVM fuzzer.
|
||||
|
||||
40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look
|
||||
only at the part of the subject that is relevant when the starting offset is
|
||||
non-zero.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
80
RunTest
80
RunTest
|
@ -68,12 +68,13 @@ title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support"
|
|||
title11="Test 11: Specials for the basic 16-bit and 32-bit libraries"
|
||||
title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support"
|
||||
title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries"
|
||||
title14="Test 14: Non-JIT limits and other non-JIT tests"
|
||||
title15="Test 15: JIT-specific features when JIT is not available"
|
||||
title16="Test 16: JIT-specific features when JIT is available"
|
||||
title17="Test 17: Tests of the POSIX interface, excluding UTF/UCP"
|
||||
title18="Test 18: Tests of the POSIX interface with UTF/UCP"
|
||||
title19="Test 19: Serialization tests"
|
||||
title14="Test 14: DFA specials for UTF and UCP support"
|
||||
title15="Test 15: Non-JIT limits and other non-JIT tests"
|
||||
title16="Test 16: JIT-specific features when JIT is not available"
|
||||
title17="Test 17: JIT-specific features when JIT is available"
|
||||
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
|
||||
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
|
||||
title20="Test 20: Serialization tests"
|
||||
maxtest=18
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
|
@ -97,6 +98,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title17
|
||||
echo $title18
|
||||
echo $title19
|
||||
echo $title20
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -219,6 +221,7 @@ do16=no
|
|||
do17=no
|
||||
do18=no
|
||||
do19=no
|
||||
do20=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -242,6 +245,7 @@ while [ $# -gt 0 ] ; do
|
|||
17) do17=yes;;
|
||||
18) do18=yes;;
|
||||
19) do19=yes;;
|
||||
20) do20=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -387,7 +391,8 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
|
||||
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
|
||||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -409,6 +414,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do17=yes
|
||||
do18=yes
|
||||
do19=yes
|
||||
do20=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -689,70 +695,78 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
fi
|
||||
|
||||
# Test non-JIT match and recursion limits
|
||||
# Tests for DFA UTF and UCP features. Output is different for the different widths.
|
||||
|
||||
if [ $do14 = yes ] ; then
|
||||
echo $title14
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput14 testtry
|
||||
checkresult $? 14 ""
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
|
||||
checkresult $? 14-$bits "$opt"
|
||||
fi
|
||||
|
||||
# Test non-JIT match and recursion limits
|
||||
|
||||
if [ $do15 = yes ] ; then
|
||||
echo $title15
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
|
||||
checkresult $? 15 ""
|
||||
fi
|
||||
|
||||
# Test JIT-specific features when JIT is not available
|
||||
|
||||
if [ $do15 = yes ] ; then
|
||||
echo $title15
|
||||
if [ $jit -ne 0 ] ; then
|
||||
echo " Skipped because JIT is available"
|
||||
else
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
|
||||
checkresult $? 15 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Test JIT-specific features when JIT is available
|
||||
|
||||
if [ $do16 = yes ] ; then
|
||||
echo $title16
|
||||
if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
|
||||
echo " Skipped because JIT is not available or nojit was specified"
|
||||
if [ $jit -ne 0 ] ; then
|
||||
echo " Skipped because JIT is available"
|
||||
else
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput16 testtry
|
||||
checkresult $? 16 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Tests for the POSIX interface without UTF/UCP (8-bit only)
|
||||
# Test JIT-specific features when JIT is available
|
||||
|
||||
if [ $do17 = yes ] ; then
|
||||
echo $title17
|
||||
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
|
||||
echo " Skipped when running 16/32-bit tests"
|
||||
if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
|
||||
echo " Skipped because JIT is not available or nojit was specified"
|
||||
else
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput17 testtry
|
||||
checkresult $? 17 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Tests for the POSIX interface with UTF/UCP (8-bit only)
|
||||
# Tests for the POSIX interface without UTF/UCP (8-bit only)
|
||||
|
||||
if [ $do18 = yes ] ; then
|
||||
echo $title18
|
||||
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
|
||||
echo " Skipped when running 16/32-bit tests"
|
||||
elif [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput18 testtry
|
||||
checkresult $? 18 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Serialization tests
|
||||
# Tests for the POSIX interface with UTF/UCP (8-bit only)
|
||||
|
||||
if [ $do19 = yes ] ; then
|
||||
echo $title19
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
|
||||
checkresult $? 19 ""
|
||||
if [ "$bits" = "16" -o "$bits" = "32" ] ; then
|
||||
echo " Skipped when running 16/32-bit tests"
|
||||
elif [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
|
||||
checkresult $? 19 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Serialization tests
|
||||
|
||||
if [ $do20 = yes ] ; then
|
||||
echo $title20
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
|
||||
checkresult $? 20 ""
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2API 3 "18 August 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -2022,12 +2022,19 @@ If the pattern is anchored, such a match can occur only if the pattern contains
|
|||
.sp
|
||||
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
|
||||
string is checked by default when \fBpcre2_match()\fP is subsequently called.
|
||||
The entire string is checked before any other processing takes place, and a
|
||||
If a non-zero starting offset is given, the check is applied only to that part
|
||||
of the subject that could be inspected during matching, and there is a check
|
||||
that the starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pattern, the
|
||||
check starts at the starting offset. Otherwise, it starts at the length of the
|
||||
longest lookbehind before the starting offset, or at the start of the subject
|
||||
if there are not that many characters before the starting offset. Note that the
|
||||
sequences \eb and \eB are one-character lookbehinds.
|
||||
.P
|
||||
The check is carried out before any other processing takes place, and a
|
||||
negative error code is returned if the check fails. There are several UTF error
|
||||
codes for each code unit width, corresponding to different problems with the
|
||||
code unit sequence. The value of \fIstartoffset\fP is also checked, to ensure
|
||||
that it points to the start of a character or to the end of the subject. There
|
||||
are discussions about the validity of
|
||||
code unit sequence. There are discussions about the validity of
|
||||
.\" HTML <a href="pcre2unicode.html#utf8strings">
|
||||
.\" </a>
|
||||
UTF-8 strings,
|
||||
|
@ -2939,6 +2946,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 29 July 2015
|
||||
Last updated: 18 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "23 November 2014" "PCRE2 10.00"
|
||||
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -117,11 +117,21 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code known
|
|||
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
||||
strings to be in host byte order.
|
||||
.P
|
||||
The entire string is checked before any other processing takes place. In
|
||||
addition to checking the format of the string, there is a check to ensure that
|
||||
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
|
||||
The so-called "non-character" code points are not excluded because Unicode
|
||||
corrigendum #9 makes it clear that they should not be.
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
|
||||
offset, the check is applied only to that part of the subject that could be
|
||||
inspected during matching, and there is a check that the starting offset points
|
||||
to the first code unit of a character or to the end of the subject. If there
|
||||
are no lookbehind assertions in the pattern, the check starts at the starting
|
||||
offset. Otherwise, it starts at the length of the longest lookbehind before the
|
||||
starting offset, or at the start of the subject if there are not that many
|
||||
characters before the starting offset. Note that the sequences \eb and \eB are
|
||||
one-character lookbehinds.
|
||||
.P
|
||||
In addition to checking the format of the string, there is a check to ensure
|
||||
that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
|
||||
area. The so-called "non-character" code points are not excluded because
|
||||
Unicode corrigendum #9 makes it clear that they should not be.
|
||||
.P
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
|
||||
where they are used in pairs to encode code points with values greater than
|
||||
|
@ -252,6 +262,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 18 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -4682,7 +4682,7 @@ for (;; ptr++)
|
|||
that it's a length rather than a small character. */
|
||||
|
||||
#ifdef MAYBE_UTF_MULTI
|
||||
if (utf && NOT_FIRSTCHAR(code[-1]))
|
||||
if (utf && NOT_FIRSTCU(code[-1]))
|
||||
{
|
||||
PCRE2_UCHAR *lastchar = code - 1;
|
||||
BACKCHAR(lastchar);
|
||||
|
|
|
@ -2774,7 +2774,7 @@ for (;;)
|
|||
{
|
||||
PCRE2_SPTR p = start_subject + local_offsets[rc];
|
||||
PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
|
||||
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
|
||||
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
|
||||
}
|
||||
#endif
|
||||
if (charcount > 0)
|
||||
|
@ -2874,7 +2874,7 @@ for (;;)
|
|||
PCRE2_SPTR pp = local_ptr;
|
||||
charcount = (int)(pp - p);
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
|
||||
if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
|
||||
#endif
|
||||
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
||||
}
|
||||
|
@ -2960,7 +2960,7 @@ for (;;)
|
|||
{
|
||||
PCRE2_SPTR p = start_subject + local_offsets[0];
|
||||
PCRE2_SPTR pp = start_subject + local_offsets[1];
|
||||
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
|
||||
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
|
||||
}
|
||||
#endif
|
||||
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
||||
|
@ -3264,18 +3264,50 @@ switch(re->newline_convention)
|
|||
|
||||
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
|
||||
we must also check that a starting offset does not point into the middle of a
|
||||
multiunit character. */
|
||||
multiunit character. We check only the portion of the subject that is going to
|
||||
be inspected during matching - from the offset minus the maximum lookbehind
|
||||
to the given length. This saves time when a small part of a large subject is
|
||||
being matched by the use of a starting offset. Note that the maximum lookbehind
|
||||
is a number of characters, not code units. */
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
|
||||
{
|
||||
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
|
||||
if (match_data->rc != 0) return match_data->rc;
|
||||
PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
|
||||
|
||||
if (start_offset > 0)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (start_offset > 0 && start_offset < length &&
|
||||
NOT_FIRSTCHAR(subject[start_offset]))
|
||||
return PCRE2_ERROR_BADUTFOFFSET;
|
||||
unsigned int i;
|
||||
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
|
||||
return PCRE2_ERROR_BADUTFOFFSET;
|
||||
for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
|
||||
{
|
||||
check_subject--;
|
||||
while (check_subject > subject &&
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
(*check_subject & 0xc0) == 0x80)
|
||||
#else /* 16-bit */
|
||||
(*check_subject & 0xfc00) == 0xdc00)
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||
check_subject--;
|
||||
}
|
||||
#else /* In the 32-bit library, one code unit equals one character. */
|
||||
check_subject -= re->max_lookbehind;
|
||||
if (check_subject < subject) check_subject = subject;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
|
||||
}
|
||||
|
||||
/* Validate the relevant portion of the subject. After an error, adjust the
|
||||
offset to be an absolute offset in the whole string. */
|
||||
|
||||
match_data->rc = PRIV(valid_utf)(check_subject,
|
||||
length - (check_subject - subject), &(match_data->startchar));
|
||||
if (match_data->rc != 0)
|
||||
{
|
||||
match_data->startchar += check_subject - subject;
|
||||
return match_data->rc;
|
||||
}
|
||||
}
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
|
|
|
@ -72,7 +72,7 @@ just to undefine them all. */
|
|||
#undef MAX_MARK
|
||||
#undef MAX_PATTERN_SIZE
|
||||
#undef MAX_UTF_SINGLE_CU
|
||||
#undef NOT_FIRSTCHAR
|
||||
#undef NOT_FIRSTCU
|
||||
#undef PUT
|
||||
#undef PUT2
|
||||
#undef PUT2INC
|
||||
|
@ -252,7 +252,7 @@ UTF support is omitted, we don't even define them. */
|
|||
/* #define MAX_UTF_SINGLE_CU */
|
||||
/* #define HAS_EXTRALEN(c) */
|
||||
/* #define GET_EXTRALEN(c) */
|
||||
/* #define NOT_FIRSTCHAR(c) */
|
||||
/* #define NOT_FIRSTCU(c) */
|
||||
#define GETCHAR(c, eptr) c = *eptr;
|
||||
#define GETCHARTEST(c, eptr) c = *eptr;
|
||||
#define GETCHARINC(c, eptr) c = *eptr++;
|
||||
|
@ -285,10 +285,10 @@ Otherwise it has an undefined behaviour. */
|
|||
|
||||
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
|
||||
|
||||
/* Returns TRUE, if the given character is not the first character
|
||||
of a UTF sequence. */
|
||||
/* Returns TRUE, if the given value is not the first code unit of a UTF
|
||||
sequence. */
|
||||
|
||||
#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
|
||||
#define NOT_FIRSTCU(c) (((c) & 0xc0) == 0x80)
|
||||
|
||||
/* Get the next UTF-8 character, not advancing the pointer. This is called when
|
||||
we know we are in UTF-8 mode. */
|
||||
|
@ -371,10 +371,10 @@ Otherwise it has an undefined behaviour. */
|
|||
|
||||
#define GET_EXTRALEN(c) 1
|
||||
|
||||
/* Returns TRUE, if the given character is not the first character
|
||||
of a UTF sequence. */
|
||||
/* Returns TRUE, if the given value is not the first code unit of a UTF
|
||||
sequence. */
|
||||
|
||||
#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00)
|
||||
#define NOT_FIRSTCU(c) (((c) & 0xfc00) == 0xdc00)
|
||||
|
||||
/* Base macro to pick up the low surrogate of a UTF-16 character, not
|
||||
advancing the pointer. */
|
||||
|
@ -469,7 +469,7 @@ into one PCRE2_UCHAR unit. */
|
|||
#define MAX_UTF_SINGLE_CU (0x10ffffu)
|
||||
#define HAS_EXTRALEN(c) (0)
|
||||
#define GET_EXTRALEN(c) (0)
|
||||
#define NOT_FIRSTCHAR(c) (0)
|
||||
#define NOT_FIRSTCU(c) (0)
|
||||
|
||||
/* Get the next UTF-32 character, not advancing the pointer. This is called when
|
||||
we know we are in UTF-32 mode. */
|
||||
|
|
|
@ -6485,6 +6485,7 @@ mb->match_frames_base = &frame_zero;
|
|||
subject string. */
|
||||
|
||||
if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
|
||||
end_subject = subject + length;
|
||||
|
||||
/* Plausibility checks */
|
||||
|
||||
|
@ -6536,18 +6537,50 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
|
|||
|
||||
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
|
||||
we must also check that a starting offset does not point into the middle of a
|
||||
multiunit character. */
|
||||
multiunit character. We check only the portion of the subject that is going to
|
||||
be inspected during matching - from the offset minus the maximum lookbehind
|
||||
to the given length. This saves time when a small part of a large subject is
|
||||
being matched by the use of a starting offset. Note that the maximum lookbehind
|
||||
is a number of characters, not code units. */
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
|
||||
{
|
||||
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
|
||||
if (match_data->rc != 0) return match_data->rc;
|
||||
PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
|
||||
|
||||
if (start_offset > 0)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (start_offset > 0 && start_offset < length &&
|
||||
NOT_FIRSTCHAR(subject[start_offset]))
|
||||
return PCRE2_ERROR_BADUTFOFFSET;
|
||||
unsigned int i;
|
||||
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
|
||||
return PCRE2_ERROR_BADUTFOFFSET;
|
||||
for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
|
||||
{
|
||||
check_subject--;
|
||||
while (check_subject > subject &&
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
(*check_subject & 0xc0) == 0x80)
|
||||
#else /* 16-bit */
|
||||
(*check_subject & 0xfc00) == 0xdc00)
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||
check_subject--;
|
||||
}
|
||||
#else /* In the 32-bit library, one code unit equals one character. */
|
||||
check_subject -= re->max_lookbehind;
|
||||
if (check_subject < subject) check_subject = subject;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
|
||||
}
|
||||
|
||||
/* Validate the relevant portion of the subject. After an error, adjust the
|
||||
offset to be an absolute offset in the whole string. */
|
||||
|
||||
match_data->rc = PRIV(valid_utf)(check_subject,
|
||||
length - (check_subject - subject), &(match_data->startchar));
|
||||
if (match_data->rc != 0)
|
||||
{
|
||||
match_data->startchar += check_subject - subject;
|
||||
return match_data->rc;
|
||||
}
|
||||
}
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
|
@ -6594,7 +6627,7 @@ else
|
|||
|
||||
mb->start_subject = subject;
|
||||
mb->start_offset = start_offset;
|
||||
mb->end_subject = end_subject = mb->start_subject + length;
|
||||
mb->end_subject = end_subject;
|
||||
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
|
||||
|
||||
mb->moptions = options; /* Match options */
|
||||
|
|
|
@ -133,6 +133,35 @@
|
|||
\xfc\x84\x80\x80\x80\x80\=no_utf_check
|
||||
\xfd\x83\x80\x80\x80\x80\=no_utf_check
|
||||
|
||||
# Similar tests with offsets
|
||||
|
||||
/badutf/utf
|
||||
X\xdfabcd
|
||||
X\xdfabcd\=offset=1
|
||||
X\xdfabcd\=offset=2
|
||||
|
||||
/(?<=x)badutf/utf
|
||||
X\xdfabcd
|
||||
X\xdfabcd\=offset=1
|
||||
X\xdfabcd\=offset=2
|
||||
X\xdfabcd\=offset=3
|
||||
X\xdfabcd\xdf\=offset=3
|
||||
|
||||
/(?<=xx)badutf/utf
|
||||
X\xdfabcd
|
||||
X\xdfabcd\=offset=1
|
||||
X\xdfabcd\=offset=2
|
||||
X\xdfabcd\=offset=3
|
||||
|
||||
/(?<=xxxx)badutf/utf
|
||||
X\xdfabcd
|
||||
X\xdfabcd\=offset=1
|
||||
X\xdfabcd\=offset=2
|
||||
X\xdfabcd\=offset=3
|
||||
X\xdfabcd\=offset=6
|
||||
X\xdfabc\xdf\=offset=6
|
||||
X\xdfabc\xdf\=offset=7
|
||||
|
||||
/\x{100}/IB,utf
|
||||
|
||||
/\x{1000}/IB,utf
|
||||
|
|
|
@ -158,6 +158,7 @@
|
|||
|
||||
/X/utf
|
||||
XX\x{d800}
|
||||
XX\x{d800}\=offset=3
|
||||
XX\x{d800}\=no_utf_check
|
||||
XX\x{da00}
|
||||
XX\x{da00}\=no_utf_check
|
||||
|
@ -170,6 +171,9 @@
|
|||
XX\x{110000}
|
||||
XX\x{d800}\x{1234}
|
||||
|
||||
/(?<=.)X/utf
|
||||
XX\x{d800}\=offset=3
|
||||
|
||||
/(*UTF16)\x{11234}/
|
||||
abcd\x{11234}pqr
|
||||
|
||||
|
|
|
@ -1,155 +1,37 @@
|
|||
# These are:
|
||||
#
|
||||
# (1) Tests of the match-limiting features. The results are different for
|
||||
# interpretive or JIT matching, so this test should not be run with JIT. The
|
||||
# same tests are run using JIT in test 16.
|
||||
# These test special (mostly error) UTF features of DFA matching. They are a
|
||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
||||
# The output is different for the different widths.
|
||||
|
||||
# (2) Other tests that must not be run with JIT.
|
||||
#subject dfa
|
||||
|
||||
/(a+)*zz/I
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
|
||||
aaaaaaaaaaaaaz\=find_limits
|
||||
/X/utf
|
||||
XX\x{d800}
|
||||
XX\x{d800}\=offset=3
|
||||
XX\x{d800}\=no_utf_check
|
||||
XX\x{da00}
|
||||
XX\x{da00}\=no_utf_check
|
||||
XX\x{dc00}
|
||||
XX\x{dc00}\=no_utf_check
|
||||
XX\x{de00}
|
||||
XX\x{de00}\=no_utf_check
|
||||
XX\x{dfff}
|
||||
XX\x{dfff}\=no_utf_check
|
||||
XX\x{110000}
|
||||
XX\x{d800}\x{1234}
|
||||
|
||||
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
|
||||
/* this is a C style comment */\=find_limits
|
||||
/badutf/utf
|
||||
X\xdf
|
||||
XX\xef
|
||||
XXX\xef\x80
|
||||
X\xf7
|
||||
XX\xf7\x80
|
||||
XXX\xf7\x80\x80
|
||||
|
||||
/^(?>a)++/
|
||||
aa\=find_limits
|
||||
aaaaaaaaa\=find_limits
|
||||
|
||||
/(a)(?1)++/
|
||||
aa\=find_limits
|
||||
aaaaaaaaa\=find_limits
|
||||
|
||||
/a(?:.)*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
|
||||
/a(?:.(*THEN))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
|
||||
/a(?:.(*THEN:ABC))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
|
||||
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
|
||||
aabbccddee\=find_limits
|
||||
|
||||
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
|
||||
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
|
||||
/(*LIMIT_MATCH=12bc)abc/
|
||||
|
||||
/(*LIMIT_MATCH=4294967290)abc/
|
||||
|
||||
/(*LIMIT_RECURSION=4294967280)abc/I
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
|
||||
/(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=match_limit=60000
|
||||
|
||||
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
|
||||
/(*LIMIT_MATCH=60000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
|
||||
/(*LIMIT_RECURSION=10)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=recursion_limit=1000
|
||||
|
||||
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
|
||||
/(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
|
||||
# These three have infinitely nested recursions.
|
||||
|
||||
/((?2))((?1))/
|
||||
abc
|
||||
|
||||
/((?(R2)a+|(?1)b))/
|
||||
aaaabcde
|
||||
|
||||
/(?(R)a*(?1)|((?R))b)/
|
||||
aaaabcde
|
||||
|
||||
# The allusedtext modifier does not work with JIT, which does not maintain
|
||||
# the leftchar/rightchar data.
|
||||
|
||||
/abc(?=xyz)/allusedtext
|
||||
abcxyzpqr
|
||||
abcxyzpqr\=aftertext
|
||||
|
||||
/(?<=pqr)abc(?=xyz)/allusedtext
|
||||
xyzpqrabcxyzpqr
|
||||
xyzpqrabcxyzpqr\=aftertext
|
||||
|
||||
/a\b/
|
||||
a.\=allusedtext
|
||||
a\=allusedtext
|
||||
|
||||
/abc\Kxyz/
|
||||
abcxyz\=allusedtext
|
||||
|
||||
/abc(?=xyz(*ACCEPT))/
|
||||
abcxyz\=allusedtext
|
||||
|
||||
/abc(?=abcde)(?=ab)/allusedtext
|
||||
abcabcdefg
|
||||
|
||||
# These tests provoke recursion loops, which give a different error message
|
||||
# when JIT is used.
|
||||
|
||||
/(?R)/I
|
||||
abcd
|
||||
|
||||
/(a|(?R))/I
|
||||
abcd
|
||||
defg
|
||||
|
||||
/(ab|(bc|(de|(?R))))/I
|
||||
abcd
|
||||
fghi
|
||||
|
||||
/(ab|(bc|(de|(?1))))/I
|
||||
abcd
|
||||
fghi
|
||||
|
||||
/x(ab|(bc|(de|(?1)x)x)x)/I
|
||||
xab123
|
||||
xfghi
|
||||
|
||||
/(?!\w)(?R)/
|
||||
abcd
|
||||
=abc
|
||||
|
||||
/(?=\w)(?R)/
|
||||
=abc
|
||||
abcd
|
||||
|
||||
/(?<!\w)(?R)/
|
||||
abcd
|
||||
|
||||
/(?<=\w)(?R)/
|
||||
abcd
|
||||
|
||||
/(a+|(?R)b)/
|
||||
aaa
|
||||
bbb
|
||||
|
||||
/[^\xff]((?1))/BI
|
||||
abcd
|
||||
/shortutf/utf
|
||||
XX\xdf\=ph
|
||||
XX\xef\=ph
|
||||
XX\xef\x80\=ph
|
||||
\xf7\=ph
|
||||
\xf7\x80\=ph
|
||||
|
||||
# End of testinput14
|
||||
|
|
|
@ -1,9 +1,155 @@
|
|||
# This test is run only when JIT support is not available. It checks that an
|
||||
# attempt to use it has the expected behaviour. It also tests things that
|
||||
# are different without JIT.
|
||||
# These are:
|
||||
#
|
||||
# (1) Tests of the match-limiting features. The results are different for
|
||||
# interpretive or JIT matching, so this test should not be run with JIT. The
|
||||
# same tests are run using JIT in test 17.
|
||||
|
||||
/abc/I,jit,jitverify
|
||||
# (2) Other tests that must not be run with JIT.
|
||||
|
||||
/a*/I
|
||||
/(a+)*zz/I
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
|
||||
aaaaaaaaaaaaaz\=find_limits
|
||||
|
||||
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
|
||||
/* this is a C style comment */\=find_limits
|
||||
|
||||
/^(?>a)++/
|
||||
aa\=find_limits
|
||||
aaaaaaaaa\=find_limits
|
||||
|
||||
/(a)(?1)++/
|
||||
aa\=find_limits
|
||||
aaaaaaaaa\=find_limits
|
||||
|
||||
/a(?:.)*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
|
||||
/a(?:.(*THEN))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
|
||||
/a(?:.(*THEN:ABC))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
|
||||
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
|
||||
aabbccddee\=find_limits
|
||||
|
||||
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
|
||||
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
|
||||
/(*LIMIT_MATCH=12bc)abc/
|
||||
|
||||
/(*LIMIT_MATCH=4294967290)abc/
|
||||
|
||||
/(*LIMIT_RECURSION=4294967280)abc/I
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
|
||||
/(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=match_limit=60000
|
||||
|
||||
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
|
||||
/(*LIMIT_MATCH=60000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
|
||||
/(*LIMIT_RECURSION=10)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=recursion_limit=1000
|
||||
|
||||
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
|
||||
/(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
aaaaaaaaaaaaaz
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
|
||||
# These three have infinitely nested recursions.
|
||||
|
||||
/((?2))((?1))/
|
||||
abc
|
||||
|
||||
/((?(R2)a+|(?1)b))/
|
||||
aaaabcde
|
||||
|
||||
/(?(R)a*(?1)|((?R))b)/
|
||||
aaaabcde
|
||||
|
||||
# The allusedtext modifier does not work with JIT, which does not maintain
|
||||
# the leftchar/rightchar data.
|
||||
|
||||
/abc(?=xyz)/allusedtext
|
||||
abcxyzpqr
|
||||
abcxyzpqr\=aftertext
|
||||
|
||||
/(?<=pqr)abc(?=xyz)/allusedtext
|
||||
xyzpqrabcxyzpqr
|
||||
xyzpqrabcxyzpqr\=aftertext
|
||||
|
||||
/a\b/
|
||||
a.\=allusedtext
|
||||
a\=allusedtext
|
||||
|
||||
/abc\Kxyz/
|
||||
abcxyz\=allusedtext
|
||||
|
||||
/abc(?=xyz(*ACCEPT))/
|
||||
abcxyz\=allusedtext
|
||||
|
||||
/abc(?=abcde)(?=ab)/allusedtext
|
||||
abcabcdefg
|
||||
|
||||
# These tests provoke recursion loops, which give a different error message
|
||||
# when JIT is used.
|
||||
|
||||
/(?R)/I
|
||||
abcd
|
||||
|
||||
/(a|(?R))/I
|
||||
abcd
|
||||
defg
|
||||
|
||||
/(ab|(bc|(de|(?R))))/I
|
||||
abcd
|
||||
fghi
|
||||
|
||||
/(ab|(bc|(de|(?1))))/I
|
||||
abcd
|
||||
fghi
|
||||
|
||||
/x(ab|(bc|(de|(?1)x)x)x)/I
|
||||
xab123
|
||||
xfghi
|
||||
|
||||
/(?!\w)(?R)/
|
||||
abcd
|
||||
=abc
|
||||
|
||||
/(?=\w)(?R)/
|
||||
=abc
|
||||
abcd
|
||||
|
||||
/(?<!\w)(?R)/
|
||||
abcd
|
||||
|
||||
/(?<=\w)(?R)/
|
||||
abcd
|
||||
|
||||
/(a+|(?R)b)/
|
||||
aaa
|
||||
bbb
|
||||
|
||||
/[^\xff]((?1))/BI
|
||||
abcd
|
||||
|
||||
# End of testinput15
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,17 +1,95 @@
|
|||
# This set of tests is run only with the 8-bit library. It tests the POSIX
|
||||
# interface with UTF/UCP support, which is supported only with the 8-bit
|
||||
# library. This test should not be run with JIT (which is not available for the
|
||||
# POSIX interface).
|
||||
# interface, which is supported only with the 8-bit library. This test should
|
||||
# not be run with JIT (which is not available for the POSIX interface).
|
||||
|
||||
#forbid_utf
|
||||
#pattern posix
|
||||
|
||||
/a\x{1234}b/utf
|
||||
a\x{1234}b
|
||||
# Test invalid options
|
||||
|
||||
/\w/
|
||||
+++\x{c2}
|
||||
/abc/auto_callout
|
||||
|
||||
/\w/ucp
|
||||
+++\x{c2}
|
||||
/abc/
|
||||
abc\=find_limits
|
||||
|
||||
# End of testdata/testinput17
|
||||
/abc/
|
||||
abc\=partial_hard
|
||||
|
||||
# Real tests
|
||||
|
||||
/abc/
|
||||
abc
|
||||
*** Failers
|
||||
|
||||
/^abc|def/
|
||||
abcdef
|
||||
abcdef\=notbol
|
||||
|
||||
/.*((abc)$|(def))/
|
||||
defabc
|
||||
defabc\=noteol
|
||||
|
||||
/the quick brown fox/
|
||||
the quick brown fox
|
||||
*** Failers
|
||||
The Quick Brown Fox
|
||||
|
||||
/the quick brown fox/i
|
||||
the quick brown fox
|
||||
The Quick Brown Fox
|
||||
|
||||
/abc.def/
|
||||
*** Failers
|
||||
abc\ndef
|
||||
|
||||
/abc$/
|
||||
abc
|
||||
abc\n
|
||||
|
||||
/(abc)\2/
|
||||
|
||||
/(abc\1)/
|
||||
abc
|
||||
|
||||
/a*(b+)(z)(z)/
|
||||
aaaabbbbzzzz
|
||||
aaaabbbbzzzz\=ovector=0
|
||||
aaaabbbbzzzz\=ovector=1
|
||||
aaaabbbbzzzz\=ovector=2
|
||||
|
||||
/ab.cd/
|
||||
ab-cd
|
||||
ab=cd
|
||||
** Failers
|
||||
ab\ncd
|
||||
|
||||
/ab.cd/s
|
||||
ab-cd
|
||||
ab=cd
|
||||
ab\ncd
|
||||
|
||||
/a(b)c/no_auto_capture
|
||||
abc
|
||||
|
||||
/a(?P<name>b)c/no_auto_capture
|
||||
abc
|
||||
|
||||
/a?|b?/
|
||||
abc
|
||||
** Failers
|
||||
ddd\=notempty
|
||||
|
||||
/\w+A/
|
||||
CDAAAAB
|
||||
|
||||
/\w+A/ungreedy
|
||||
CDAAAAB
|
||||
|
||||
/\Biss\B/I,aftertext
|
||||
Mississippi
|
||||
|
||||
/abc/\
|
||||
|
||||
"(?(?C)"
|
||||
|
||||
# End of testdata/testinput18
|
||||
|
|
|
@ -1,62 +1,17 @@
|
|||
# This set of tests exercises the serialization/deserialization functions in
|
||||
# the library. It does not use UTF or JIT.
|
||||
# This set of tests is run only with the 8-bit library. It tests the POSIX
|
||||
# interface with UTF/UCP support, which is supported only with the 8-bit
|
||||
# library. This test should not be run with JIT (which is not available for the
|
||||
# POSIX interface).
|
||||
|
||||
#forbid_utf
|
||||
#pattern posix
|
||||
|
||||
# Compile several patterns, push them onto the stack, and then write them
|
||||
# all to a file.
|
||||
/a\x{1234}b/utf
|
||||
a\x{1234}b
|
||||
|
||||
#pattern push
|
||||
/\w/
|
||||
+++\x{c2}
|
||||
|
||||
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
|
||||
(?(DEFINE)
|
||||
(?<NAME_PAT>[a-z]+)
|
||||
(?<ADDRESS_PAT>\d+)
|
||||
)/x
|
||||
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
|
||||
/\w/ucp
|
||||
+++\x{c2}
|
||||
|
||||
#save testsaved1
|
||||
|
||||
# Do it again for some more patterns.
|
||||
|
||||
/(*MARK:A)(*SKIP:B)(C|X)/mark
|
||||
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
|
||||
|
||||
#save testsaved2
|
||||
#pattern -push
|
||||
|
||||
# Reload the patterns, then pop them one by one and check them.
|
||||
|
||||
#load testsaved1
|
||||
#load testsaved2
|
||||
|
||||
#pop info
|
||||
foofoo
|
||||
barbar
|
||||
|
||||
#pop mark
|
||||
C
|
||||
D
|
||||
|
||||
#pop
|
||||
AmanaplanacanalPanama
|
||||
|
||||
#pop info
|
||||
metcalfe 33
|
||||
|
||||
# Check for an error when different tables are used.
|
||||
|
||||
/abc/push,tables=1
|
||||
/xyz/push,tables=2
|
||||
#save testsaved1
|
||||
|
||||
#pop
|
||||
xyz
|
||||
|
||||
#pop
|
||||
abc
|
||||
|
||||
#pop should give an error
|
||||
pqr
|
||||
|
||||
# End of testinput19
|
||||
# End of testdata/testinput19
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# This set of tests exercises the serialization/deserialization functions in
|
||||
# the library. It does not use UTF or JIT.
|
||||
|
||||
#forbid_utf
|
||||
|
||||
# Compile several patterns, push them onto the stack, and then write them
|
||||
# all to a file.
|
||||
|
||||
#pattern push
|
||||
|
||||
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
|
||||
(?(DEFINE)
|
||||
(?<NAME_PAT>[a-z]+)
|
||||
(?<ADDRESS_PAT>\d+)
|
||||
)/x
|
||||
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
|
||||
|
||||
#save testsaved1
|
||||
|
||||
# Do it again for some more patterns.
|
||||
|
||||
/(*MARK:A)(*SKIP:B)(C|X)/mark
|
||||
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
|
||||
|
||||
#save testsaved2
|
||||
#pattern -push
|
||||
|
||||
# Reload the patterns, then pop them one by one and check them.
|
||||
|
||||
#load testsaved1
|
||||
#load testsaved2
|
||||
|
||||
#pop info
|
||||
foofoo
|
||||
barbar
|
||||
|
||||
#pop mark
|
||||
C
|
||||
D
|
||||
|
||||
#pop
|
||||
AmanaplanacanalPanama
|
||||
|
||||
#pop info
|
||||
metcalfe 33
|
||||
|
||||
# Check for an error when different tables are used.
|
||||
|
||||
/abc/push,tables=1
|
||||
/xyz/push,tables=2
|
||||
#save testsaved1
|
||||
|
||||
#pop
|
||||
xyz
|
||||
|
||||
#pop
|
||||
abc
|
||||
|
||||
#pop should give an error
|
||||
pqr
|
||||
|
||||
# End of testinput20
|
|
@ -236,6 +236,54 @@ No match
|
|||
\xfd\x83\x80\x80\x80\x80\=no_utf_check
|
||||
No match
|
||||
|
||||
# Similar tests with offsets
|
||||
|
||||
/badutf/utf
|
||||
X\xdfabcd
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=1
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=2
|
||||
No match
|
||||
|
||||
/(?<=x)badutf/utf
|
||||
X\xdfabcd
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=1
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=2
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=3
|
||||
No match
|
||||
X\xdfabcd\xdf\=offset=3
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6
|
||||
|
||||
/(?<=xx)badutf/utf
|
||||
X\xdfabcd
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=1
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=2
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=3
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
|
||||
/(?<=xxxx)badutf/utf
|
||||
X\xdfabcd
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=1
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=2
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=3
|
||||
Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
|
||||
X\xdfabcd\=offset=6
|
||||
No match
|
||||
X\xdfabc\xdf\=offset=6
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5
|
||||
X\xdfabc\xdf\=offset=7
|
||||
Failed: error -33: bad offset value
|
||||
|
||||
/\x{100}/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
|
|
@ -609,6 +609,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
|
|||
/X/utf
|
||||
XX\x{d800}
|
||||
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
|
||||
XX\x{d800}\=offset=3
|
||||
No match
|
||||
XX\x{d800}\=no_utf_check
|
||||
0: X
|
||||
XX\x{da00}
|
||||
|
@ -632,6 +634,10 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
|
|||
XX\x{d800}\x{1234}
|
||||
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
|
||||
|
||||
/(?<=.)X/utf
|
||||
XX\x{d800}\=offset=3
|
||||
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
|
||||
|
||||
/(*UTF16)\x{11234}/
|
||||
abcd\x{11234}pqr
|
||||
0: \x{11234}
|
||||
|
|
|
@ -602,6 +602,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
|
|||
/X/utf
|
||||
XX\x{d800}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{d800}\=offset=3
|
||||
No match
|
||||
XX\x{d800}\=no_utf_check
|
||||
0: X
|
||||
XX\x{da00}
|
||||
|
@ -625,6 +627,10 @@ Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defin
|
|||
XX\x{d800}\x{1234}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
|
||||
/(?<=.)X/utf
|
||||
XX\x{d800}\=offset=3
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
|
||||
/(*UTF16)\x{11234}/
|
||||
Failed: error 160 at offset 5: (*VERB) not recognized or malformed
|
||||
abcd\x{11234}pqr
|
||||
|
|
|
@ -1,334 +0,0 @@
|
|||
# These are:
|
||||
#
|
||||
# (1) Tests of the match-limiting features. The results are different for
|
||||
# interpretive or JIT matching, so this test should not be run with JIT. The
|
||||
# same tests are run using JIT in test 16.
|
||||
|
||||
# (2) Other tests that must not be run with JIT.
|
||||
|
||||
/(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
|
||||
Minimum match limit = 8
|
||||
Minimum recursion limit = 6
|
||||
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
|
||||
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
aaaaaaaaaaaaaz\=find_limits
|
||||
Minimum match limit = 32768
|
||||
Minimum recursion limit = 29
|
||||
No match
|
||||
|
||||
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
|
||||
Capturing subpattern count = 1
|
||||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
/* this is a C style comment */\=find_limits
|
||||
Minimum match limit = 120
|
||||
Minimum recursion limit = 6
|
||||
0: /* this is a C style comment */
|
||||
1: /* this is a C style comment */
|
||||
|
||||
/^(?>a)++/
|
||||
aa\=find_limits
|
||||
Minimum match limit = 5
|
||||
Minimum recursion limit = 2
|
||||
0: aa
|
||||
aaaaaaaaa\=find_limits
|
||||
Minimum match limit = 12
|
||||
Minimum recursion limit = 2
|
||||
0: aaaaaaaaa
|
||||
|
||||
/(a)(?1)++/
|
||||
aa\=find_limits
|
||||
Minimum match limit = 7
|
||||
Minimum recursion limit = 4
|
||||
0: aa
|
||||
1: a
|
||||
aaaaaaaaa\=find_limits
|
||||
Minimum match limit = 21
|
||||
Minimum recursion limit = 4
|
||||
0: aaaaaaaaa
|
||||
1: a
|
||||
|
||||
/a(?:.)*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
Minimum match limit = 65
|
||||
Minimum recursion limit = 2
|
||||
0: abbbbbbbbbbbbbbbbbbbbba
|
||||
|
||||
/a(?:.(*THEN))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
Minimum match limit = 86
|
||||
Minimum recursion limit = 45
|
||||
0: abbbbbbbbbbbbbbbbbbbbba
|
||||
|
||||
/a(?:.(*THEN:ABC))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
Minimum match limit = 86
|
||||
Minimum recursion limit = 45
|
||||
0: abbbbbbbbbbbbbbbbbbbbba
|
||||
|
||||
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
|
||||
aabbccddee\=find_limits
|
||||
Minimum match limit = 7
|
||||
Minimum recursion limit = 2
|
||||
0: aabbccddee
|
||||
|
||||
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
Minimum match limit = 17
|
||||
Minimum recursion limit = 16
|
||||
0: aabbccddee
|
||||
1: aa
|
||||
2: bb
|
||||
3: cc
|
||||
4: dd
|
||||
5: ee
|
||||
|
||||
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
Minimum match limit = 13
|
||||
Minimum recursion limit = 10
|
||||
0: aabbccddee
|
||||
1: aa
|
||||
2: cc
|
||||
3: ee
|
||||
|
||||
/(*LIMIT_MATCH=12bc)abc/
|
||||
Failed: error 160 at offset 17: (*VERB) not recognized or malformed
|
||||
|
||||
/(*LIMIT_MATCH=4294967290)abc/
|
||||
Failed: error 160 at offset 24: (*VERB) not recognized or malformed
|
||||
|
||||
/(*LIMIT_RECURSION=4294967280)abc/I
|
||||
Capturing subpattern count = 0
|
||||
Recursion limit = 4294967280
|
||||
First code unit = 'a'
|
||||
Last code unit = 'c'
|
||||
Subject length lower bound = 3
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
Failed: error -53: recursion limit exceeded
|
||||
|
||||
/(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Match limit = 3000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
Failed: error -47: match limit exceeded
|
||||
aaaaaaaaaaaaaz\=match_limit=60000
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Match limit = 3000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(*LIMIT_MATCH=60000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Match limit = 60000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(*LIMIT_RECURSION=10)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Recursion limit = 10
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
Failed: error -53: recursion limit exceeded
|
||||
aaaaaaaaaaaaaz\=recursion_limit=1000
|
||||
Failed: error -53: recursion limit exceeded
|
||||
|
||||
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Recursion limit = 1000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
|
||||
/(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Recursion limit = 1000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
Failed: error -53: recursion limit exceeded
|
||||
|
||||
# These three have infinitely nested recursions.
|
||||
|
||||
/((?2))((?1))/
|
||||
abc
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/((?(R2)a+|(?1)b))/
|
||||
aaaabcde
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?(R)a*(?1)|((?R))b)/
|
||||
aaaabcde
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
# The allusedtext modifier does not work with JIT, which does not maintain
|
||||
# the leftchar/rightchar data.
|
||||
|
||||
/abc(?=xyz)/allusedtext
|
||||
abcxyzpqr
|
||||
0: abcxyz
|
||||
>>>
|
||||
abcxyzpqr\=aftertext
|
||||
0: abcxyz
|
||||
>>>
|
||||
0+ xyzpqr
|
||||
|
||||
/(?<=pqr)abc(?=xyz)/allusedtext
|
||||
xyzpqrabcxyzpqr
|
||||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
xyzpqrabcxyzpqr\=aftertext
|
||||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
0+ xyzpqr
|
||||
|
||||
/a\b/
|
||||
a.\=allusedtext
|
||||
0: a.
|
||||
>
|
||||
a\=allusedtext
|
||||
0: a
|
||||
|
||||
/abc\Kxyz/
|
||||
abcxyz\=allusedtext
|
||||
0: abcxyz
|
||||
<<<
|
||||
|
||||
/abc(?=xyz(*ACCEPT))/
|
||||
abcxyz\=allusedtext
|
||||
0: abcxyz
|
||||
>>>
|
||||
|
||||
/abc(?=abcde)(?=ab)/allusedtext
|
||||
abcabcdefg
|
||||
0: abcabcde
|
||||
>>>>>
|
||||
|
||||
# These tests provoke recursion loops, which give a different error message
|
||||
# when JIT is used.
|
||||
|
||||
/(?R)/I
|
||||
Capturing subpattern count = 0
|
||||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(a|(?R))/I
|
||||
Capturing subpattern count = 1
|
||||
May match empty string
|
||||
Subject length lower bound = 1
|
||||
abcd
|
||||
0: a
|
||||
1: a
|
||||
defg
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(ab|(bc|(de|(?R))))/I
|
||||
Capturing subpattern count = 3
|
||||
May match empty string
|
||||
Subject length lower bound = 2
|
||||
abcd
|
||||
0: ab
|
||||
1: ab
|
||||
fghi
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(ab|(bc|(de|(?1))))/I
|
||||
Capturing subpattern count = 3
|
||||
May match empty string
|
||||
Subject length lower bound = 2
|
||||
abcd
|
||||
0: ab
|
||||
1: ab
|
||||
fghi
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/x(ab|(bc|(de|(?1)x)x)x)/I
|
||||
Capturing subpattern count = 3
|
||||
First code unit = 'x'
|
||||
Subject length lower bound = 3
|
||||
xab123
|
||||
0: xab
|
||||
1: ab
|
||||
xfghi
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?!\w)(?R)/
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
=abc
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?=\w)(?R)/
|
||||
=abc
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?<!\w)(?R)/
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?<=\w)(?R)/
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(a+|(?R)b)/
|
||||
aaa
|
||||
0: aaa
|
||||
1: aaa
|
||||
bbb
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/[^\xff]((?1))/BI
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[^\x{ff}]
|
||||
CBra 1
|
||||
Recurse
|
||||
Ket
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
Capturing subpattern count = 1
|
||||
Subject length lower bound = 1
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
# End of testinput14
|
|
@ -0,0 +1,61 @@
|
|||
# These test special (mostly error) UTF features of DFA matching. They are a
|
||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
||||
# The output is different for the different widths.
|
||||
|
||||
#subject dfa
|
||||
|
||||
/X/utf
|
||||
XX\x{d800}
|
||||
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
|
||||
XX\x{d800}\=offset=3
|
||||
No match
|
||||
XX\x{d800}\=no_utf_check
|
||||
0: X
|
||||
XX\x{da00}
|
||||
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
|
||||
XX\x{da00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{dc00}
|
||||
Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
|
||||
XX\x{dc00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{de00}
|
||||
Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
|
||||
XX\x{de00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{dfff}
|
||||
Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
|
||||
XX\x{dfff}\=no_utf_check
|
||||
0: X
|
||||
XX\x{110000}
|
||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||
XX\x{d800}\x{1234}
|
||||
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
|
||||
|
||||
/badutf/utf
|
||||
X\xdf
|
||||
No match
|
||||
XX\xef
|
||||
No match
|
||||
XXX\xef\x80
|
||||
No match
|
||||
X\xf7
|
||||
No match
|
||||
XX\xf7\x80
|
||||
No match
|
||||
XXX\xf7\x80\x80
|
||||
No match
|
||||
|
||||
/shortutf/utf
|
||||
XX\xdf\=ph
|
||||
No match
|
||||
XX\xef\=ph
|
||||
No match
|
||||
XX\xef\x80\=ph
|
||||
No match
|
||||
\xf7\=ph
|
||||
No match
|
||||
\xf7\x80\=ph
|
||||
No match
|
||||
|
||||
# End of testinput14
|
|
@ -0,0 +1,61 @@
|
|||
# These test special (mostly error) UTF features of DFA matching. They are a
|
||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
||||
# The output is different for the different widths.
|
||||
|
||||
#subject dfa
|
||||
|
||||
/X/utf
|
||||
XX\x{d800}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{d800}\=offset=3
|
||||
No match
|
||||
XX\x{d800}\=no_utf_check
|
||||
0: X
|
||||
XX\x{da00}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{da00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{dc00}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{dc00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{de00}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{de00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{dfff}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{dfff}\=no_utf_check
|
||||
0: X
|
||||
XX\x{110000}
|
||||
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2
|
||||
XX\x{d800}\x{1234}
|
||||
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
|
||||
/badutf/utf
|
||||
X\xdf
|
||||
No match
|
||||
XX\xef
|
||||
No match
|
||||
XXX\xef\x80
|
||||
No match
|
||||
X\xf7
|
||||
No match
|
||||
XX\xf7\x80
|
||||
No match
|
||||
XXX\xf7\x80\x80
|
||||
No match
|
||||
|
||||
/shortutf/utf
|
||||
XX\xdf\=ph
|
||||
No match
|
||||
XX\xef\=ph
|
||||
No match
|
||||
XX\xef\x80\=ph
|
||||
No match
|
||||
\xf7\=ph
|
||||
No match
|
||||
\xf7\x80\=ph
|
||||
No match
|
||||
|
||||
# End of testinput14
|
|
@ -0,0 +1,61 @@
|
|||
# These test special (mostly error) UTF features of DFA matching. They are a
|
||||
# selection of the more comprehensive tests that are run for non-DFA matching.
|
||||
# The output is different for the different widths.
|
||||
|
||||
#subject dfa
|
||||
|
||||
/X/utf
|
||||
XX\x{d800}
|
||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{d800}\=offset=3
|
||||
Error -36 (bad UTF-8 offset)
|
||||
XX\x{d800}\=no_utf_check
|
||||
0: X
|
||||
XX\x{da00}
|
||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{da00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{dc00}
|
||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{dc00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{de00}
|
||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{de00}\=no_utf_check
|
||||
0: X
|
||||
XX\x{dfff}
|
||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
XX\x{dfff}\=no_utf_check
|
||||
0: X
|
||||
XX\x{110000}
|
||||
Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 2
|
||||
XX\x{d800}\x{1234}
|
||||
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
|
||||
|
||||
/badutf/utf
|
||||
X\xdf
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 1
|
||||
XX\xef
|
||||
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
|
||||
XXX\xef\x80
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
|
||||
X\xf7
|
||||
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 1
|
||||
XX\xf7\x80
|
||||
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
|
||||
XXX\xf7\x80\x80
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
|
||||
|
||||
/shortutf/utf
|
||||
XX\xdf\=ph
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
|
||||
XX\xef\=ph
|
||||
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
|
||||
XX\xef\x80\=ph
|
||||
Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
|
||||
\xf7\=ph
|
||||
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
|
||||
\xf7\x80\=ph
|
||||
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
|
||||
|
||||
# End of testinput14
|
|
@ -1,17 +1,334 @@
|
|||
# This test is run only when JIT support is not available. It checks that an
|
||||
# attempt to use it has the expected behaviour. It also tests things that
|
||||
# are different without JIT.
|
||||
# These are:
|
||||
#
|
||||
# (1) Tests of the match-limiting features. The results are different for
|
||||
# interpretive or JIT matching, so this test should not be run with JIT. The
|
||||
# same tests are run using JIT in test 17.
|
||||
|
||||
/abc/I,jit,jitverify
|
||||
# (2) Other tests that must not be run with JIT.
|
||||
|
||||
/(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
|
||||
Minimum match limit = 8
|
||||
Minimum recursion limit = 6
|
||||
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
|
||||
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
aaaaaaaaaaaaaz\=find_limits
|
||||
Minimum match limit = 32768
|
||||
Minimum recursion limit = 29
|
||||
No match
|
||||
|
||||
!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
|
||||
Capturing subpattern count = 1
|
||||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
/* this is a C style comment */\=find_limits
|
||||
Minimum match limit = 120
|
||||
Minimum recursion limit = 6
|
||||
0: /* this is a C style comment */
|
||||
1: /* this is a C style comment */
|
||||
|
||||
/^(?>a)++/
|
||||
aa\=find_limits
|
||||
Minimum match limit = 5
|
||||
Minimum recursion limit = 2
|
||||
0: aa
|
||||
aaaaaaaaa\=find_limits
|
||||
Minimum match limit = 12
|
||||
Minimum recursion limit = 2
|
||||
0: aaaaaaaaa
|
||||
|
||||
/(a)(?1)++/
|
||||
aa\=find_limits
|
||||
Minimum match limit = 7
|
||||
Minimum recursion limit = 4
|
||||
0: aa
|
||||
1: a
|
||||
aaaaaaaaa\=find_limits
|
||||
Minimum match limit = 21
|
||||
Minimum recursion limit = 4
|
||||
0: aaaaaaaaa
|
||||
1: a
|
||||
|
||||
/a(?:.)*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
Minimum match limit = 65
|
||||
Minimum recursion limit = 2
|
||||
0: abbbbbbbbbbbbbbbbbbbbba
|
||||
|
||||
/a(?:.(*THEN))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
Minimum match limit = 86
|
||||
Minimum recursion limit = 45
|
||||
0: abbbbbbbbbbbbbbbbbbbbba
|
||||
|
||||
/a(?:.(*THEN:ABC))*?a/ims
|
||||
abbbbbbbbbbbbbbbbbbbbba\=find_limits
|
||||
Minimum match limit = 86
|
||||
Minimum recursion limit = 45
|
||||
0: abbbbbbbbbbbbbbbbbbbbba
|
||||
|
||||
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
|
||||
aabbccddee\=find_limits
|
||||
Minimum match limit = 7
|
||||
Minimum recursion limit = 2
|
||||
0: aabbccddee
|
||||
|
||||
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
Minimum match limit = 17
|
||||
Minimum recursion limit = 16
|
||||
0: aabbccddee
|
||||
1: aa
|
||||
2: bb
|
||||
3: cc
|
||||
4: dd
|
||||
5: ee
|
||||
|
||||
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
|
||||
aabbccddee\=find_limits
|
||||
Minimum match limit = 13
|
||||
Minimum recursion limit = 10
|
||||
0: aabbccddee
|
||||
1: aa
|
||||
2: cc
|
||||
3: ee
|
||||
|
||||
/(*LIMIT_MATCH=12bc)abc/
|
||||
Failed: error 160 at offset 17: (*VERB) not recognized or malformed
|
||||
|
||||
/(*LIMIT_MATCH=4294967290)abc/
|
||||
Failed: error 160 at offset 24: (*VERB) not recognized or malformed
|
||||
|
||||
/(*LIMIT_RECURSION=4294967280)abc/I
|
||||
Capturing subpattern count = 0
|
||||
Recursion limit = 4294967280
|
||||
First code unit = 'a'
|
||||
Last code unit = 'c'
|
||||
Subject length lower bound = 3
|
||||
JIT support is not available in this version of PCRE2
|
||||
|
||||
/a*/I
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(a+)*zz/
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
Failed: error -53: recursion limit exceeded
|
||||
|
||||
/(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Match limit = 3000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
Failed: error -47: match limit exceeded
|
||||
aaaaaaaaaaaaaz\=match_limit=60000
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Match limit = 3000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(*LIMIT_MATCH=60000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Match limit = 60000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
aaaaaaaaaaaaaz\=match_limit=3000
|
||||
Failed: error -47: match limit exceeded
|
||||
|
||||
/(*LIMIT_RECURSION=10)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Recursion limit = 10
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
Failed: error -53: recursion limit exceeded
|
||||
aaaaaaaaaaaaaz\=recursion_limit=1000
|
||||
Failed: error -53: recursion limit exceeded
|
||||
|
||||
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Recursion limit = 1000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
|
||||
/(*LIMIT_RECURSION=1000)(a+)*zz/I
|
||||
Capturing subpattern count = 1
|
||||
Recursion limit = 1000
|
||||
Starting code units: a z
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 2
|
||||
aaaaaaaaaaaaaz
|
||||
No match
|
||||
aaaaaaaaaaaaaz\=recursion_limit=10
|
||||
Failed: error -53: recursion limit exceeded
|
||||
|
||||
# These three have infinitely nested recursions.
|
||||
|
||||
/((?2))((?1))/
|
||||
abc
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/((?(R2)a+|(?1)b))/
|
||||
aaaabcde
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?(R)a*(?1)|((?R))b)/
|
||||
aaaabcde
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
# The allusedtext modifier does not work with JIT, which does not maintain
|
||||
# the leftchar/rightchar data.
|
||||
|
||||
/abc(?=xyz)/allusedtext
|
||||
abcxyzpqr
|
||||
0: abcxyz
|
||||
>>>
|
||||
abcxyzpqr\=aftertext
|
||||
0: abcxyz
|
||||
>>>
|
||||
0+ xyzpqr
|
||||
|
||||
/(?<=pqr)abc(?=xyz)/allusedtext
|
||||
xyzpqrabcxyzpqr
|
||||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
xyzpqrabcxyzpqr\=aftertext
|
||||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
0+ xyzpqr
|
||||
|
||||
/a\b/
|
||||
a.\=allusedtext
|
||||
0: a.
|
||||
>
|
||||
a\=allusedtext
|
||||
0: a
|
||||
|
||||
/abc\Kxyz/
|
||||
abcxyz\=allusedtext
|
||||
0: abcxyz
|
||||
<<<
|
||||
|
||||
/abc(?=xyz(*ACCEPT))/
|
||||
abcxyz\=allusedtext
|
||||
0: abcxyz
|
||||
>>>
|
||||
|
||||
/abc(?=abcde)(?=ab)/allusedtext
|
||||
abcabcdefg
|
||||
0: abcabcde
|
||||
>>>>>
|
||||
|
||||
# These tests provoke recursion loops, which give a different error message
|
||||
# when JIT is used.
|
||||
|
||||
/(?R)/I
|
||||
Capturing subpattern count = 0
|
||||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(a|(?R))/I
|
||||
Capturing subpattern count = 1
|
||||
May match empty string
|
||||
Subject length lower bound = 1
|
||||
abcd
|
||||
0: a
|
||||
1: a
|
||||
defg
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(ab|(bc|(de|(?R))))/I
|
||||
Capturing subpattern count = 3
|
||||
May match empty string
|
||||
Subject length lower bound = 2
|
||||
abcd
|
||||
0: ab
|
||||
1: ab
|
||||
fghi
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(ab|(bc|(de|(?1))))/I
|
||||
Capturing subpattern count = 3
|
||||
May match empty string
|
||||
Subject length lower bound = 2
|
||||
abcd
|
||||
0: ab
|
||||
1: ab
|
||||
fghi
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/x(ab|(bc|(de|(?1)x)x)x)/I
|
||||
Capturing subpattern count = 3
|
||||
First code unit = 'x'
|
||||
Subject length lower bound = 3
|
||||
xab123
|
||||
0: xab
|
||||
1: ab
|
||||
xfghi
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?!\w)(?R)/
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
=abc
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?=\w)(?R)/
|
||||
=abc
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?<!\w)(?R)/
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(?<=\w)(?R)/
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/(a+|(?R)b)/
|
||||
aaa
|
||||
0: aaa
|
||||
1: aaa
|
||||
bbb
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
/[^\xff]((?1))/BI
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[^\x{ff}]
|
||||
CBra 1
|
||||
Recurse
|
||||
Ket
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
Capturing subpattern count = 1
|
||||
Subject length lower bound = 1
|
||||
abcd
|
||||
Failed: error -52: nested recursion at the same subject position
|
||||
|
||||
# End of testinput15
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,20 +1,148 @@
|
|||
# This set of tests is run only with the 8-bit library. It tests the POSIX
|
||||
# interface with UTF/UCP support, which is supported only with the 8-bit
|
||||
# library. This test should not be run with JIT (which is not available for the
|
||||
# POSIX interface).
|
||||
# interface, which is supported only with the 8-bit library. This test should
|
||||
# not be run with JIT (which is not available for the POSIX interface).
|
||||
|
||||
#forbid_utf
|
||||
#pattern posix
|
||||
|
||||
/a\x{1234}b/utf
|
||||
a\x{1234}b
|
||||
0: a\x{1234}b
|
||||
# Test invalid options
|
||||
|
||||
/\w/
|
||||
+++\x{c2}
|
||||
/abc/auto_callout
|
||||
** Ignored with POSIX interface: auto_callout
|
||||
|
||||
/abc/
|
||||
abc\=find_limits
|
||||
** Ignored with POSIX interface: find_limits
|
||||
0: abc
|
||||
|
||||
/abc/
|
||||
abc\=partial_hard
|
||||
** Ignored with POSIX interface: partial_hard
|
||||
0: abc
|
||||
|
||||
# Real tests
|
||||
|
||||
/abc/
|
||||
abc
|
||||
0: abc
|
||||
*** Failers
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/\w/ucp
|
||||
+++\x{c2}
|
||||
0: \xc2
|
||||
/^abc|def/
|
||||
abcdef
|
||||
0: abc
|
||||
abcdef\=notbol
|
||||
0: def
|
||||
|
||||
# End of testdata/testinput17
|
||||
/.*((abc)$|(def))/
|
||||
defabc
|
||||
0: defabc
|
||||
1: abc
|
||||
2: abc
|
||||
defabc\=noteol
|
||||
0: def
|
||||
1: def
|
||||
3: def
|
||||
|
||||
/the quick brown fox/
|
||||
the quick brown fox
|
||||
0: the quick brown fox
|
||||
*** Failers
|
||||
No match: POSIX code 17: match failed
|
||||
The Quick Brown Fox
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/the quick brown fox/i
|
||||
the quick brown fox
|
||||
0: the quick brown fox
|
||||
The Quick Brown Fox
|
||||
0: The Quick Brown Fox
|
||||
|
||||
/abc.def/
|
||||
*** Failers
|
||||
No match: POSIX code 17: match failed
|
||||
abc\ndef
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/abc$/
|
||||
abc
|
||||
0: abc
|
||||
abc\n
|
||||
0: abc
|
||||
|
||||
/(abc)\2/
|
||||
Failed: POSIX code 15: bad back reference at offset 6
|
||||
|
||||
/(abc\1)/
|
||||
abc
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/a*(b+)(z)(z)/
|
||||
aaaabbbbzzzz
|
||||
0: aaaabbbbzz
|
||||
1: bbbb
|
||||
2: z
|
||||
3: z
|
||||
aaaabbbbzzzz\=ovector=0
|
||||
Matched without capture
|
||||
aaaabbbbzzzz\=ovector=1
|
||||
0: aaaabbbbzz
|
||||
aaaabbbbzzzz\=ovector=2
|
||||
0: aaaabbbbzz
|
||||
1: bbbb
|
||||
|
||||
/ab.cd/
|
||||
ab-cd
|
||||
0: ab-cd
|
||||
ab=cd
|
||||
0: ab=cd
|
||||
** Failers
|
||||
No match: POSIX code 17: match failed
|
||||
ab\ncd
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/ab.cd/s
|
||||
ab-cd
|
||||
0: ab-cd
|
||||
ab=cd
|
||||
0: ab=cd
|
||||
ab\ncd
|
||||
0: ab\x0acd
|
||||
|
||||
/a(b)c/no_auto_capture
|
||||
abc
|
||||
Matched with REG_NOSUB
|
||||
|
||||
/a(?P<name>b)c/no_auto_capture
|
||||
abc
|
||||
Matched with REG_NOSUB
|
||||
|
||||
/a?|b?/
|
||||
abc
|
||||
0: a
|
||||
** Failers
|
||||
0:
|
||||
ddd\=notempty
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/\w+A/
|
||||
CDAAAAB
|
||||
0: CDAAAA
|
||||
|
||||
/\w+A/ungreedy
|
||||
CDAAAAB
|
||||
0: CDA
|
||||
|
||||
/\Biss\B/I,aftertext
|
||||
** Ignored with POSIX interface: info
|
||||
Mississippi
|
||||
0: iss
|
||||
0+ issippi
|
||||
|
||||
/abc/\
|
||||
Failed: POSIX code 9: bad escape sequence at offset 4
|
||||
|
||||
"(?(?C)"
|
||||
Failed: POSIX code 3: pattern error at offset 2
|
||||
|
||||
# End of testdata/testinput18
|
||||
|
|
|
@ -1,100 +1,20 @@
|
|||
# This set of tests exercises the serialization/deserialization functions in
|
||||
# the library. It does not use UTF or JIT.
|
||||
# This set of tests is run only with the 8-bit library. It tests the POSIX
|
||||
# interface with UTF/UCP support, which is supported only with the 8-bit
|
||||
# library. This test should not be run with JIT (which is not available for the
|
||||
# POSIX interface).
|
||||
|
||||
#forbid_utf
|
||||
#pattern posix
|
||||
|
||||
# Compile several patterns, push them onto the stack, and then write them
|
||||
# all to a file.
|
||||
/a\x{1234}b/utf
|
||||
a\x{1234}b
|
||||
0: a\x{1234}b
|
||||
|
||||
#pattern push
|
||||
/\w/
|
||||
+++\x{c2}
|
||||
No match: POSIX code 17: match failed
|
||||
|
||||
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
|
||||
(?(DEFINE)
|
||||
(?<NAME_PAT>[a-z]+)
|
||||
(?<ADDRESS_PAT>\d+)
|
||||
)/x
|
||||
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
|
||||
/\w/ucp
|
||||
+++\x{c2}
|
||||
0: \xc2
|
||||
|
||||
#save testsaved1
|
||||
|
||||
# Do it again for some more patterns.
|
||||
|
||||
/(*MARK:A)(*SKIP:B)(C|X)/mark
|
||||
** Ignored when compiled pattern is stacked with 'push': mark
|
||||
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
|
||||
|
||||
#save testsaved2
|
||||
#pattern -push
|
||||
|
||||
# Reload the patterns, then pop them one by one and check them.
|
||||
|
||||
#load testsaved1
|
||||
#load testsaved2
|
||||
|
||||
#pop info
|
||||
Capturing subpattern count = 2
|
||||
Max back reference = 2
|
||||
Named capturing subpatterns:
|
||||
n 1
|
||||
n 2
|
||||
Options: dupnames
|
||||
Starting code units: b f
|
||||
Subject length lower bound = 6
|
||||
foofoo
|
||||
0: foofoo
|
||||
1: foo
|
||||
barbar
|
||||
0: barbar
|
||||
1: <unset>
|
||||
2: bar
|
||||
|
||||
#pop mark
|
||||
C
|
||||
0: C
|
||||
1: C
|
||||
MK: A
|
||||
D
|
||||
No match, mark = A
|
||||
|
||||
#pop
|
||||
AmanaplanacanalPanama
|
||||
0: AmanaplanacanalPanama
|
||||
1: <unset>
|
||||
2: <unset>
|
||||
3: AmanaplanacanalPanama
|
||||
4: A
|
||||
|
||||
#pop info
|
||||
Capturing subpattern count = 4
|
||||
Named capturing subpatterns:
|
||||
ADDR 2
|
||||
ADDRESS_PAT 4
|
||||
NAME 1
|
||||
NAME_PAT 3
|
||||
Options: extended
|
||||
Subject length lower bound = 3
|
||||
metcalfe 33
|
||||
0: metcalfe 33
|
||||
1: metcalfe
|
||||
2: 33
|
||||
|
||||
# Check for an error when different tables are used.
|
||||
|
||||
/abc/push,tables=1
|
||||
/xyz/push,tables=2
|
||||
#save testsaved1
|
||||
Serialization failed: error -30: patterns do not all use the same character tables
|
||||
|
||||
#pop
|
||||
xyz
|
||||
0: xyz
|
||||
|
||||
#pop
|
||||
abc
|
||||
0: abc
|
||||
|
||||
#pop should give an error
|
||||
** Can't pop off an empty stack
|
||||
pqr
|
||||
|
||||
# End of testinput19
|
||||
# End of testdata/testinput19
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
# This set of tests exercises the serialization/deserialization functions in
|
||||
# the library. It does not use UTF or JIT.
|
||||
|
||||
#forbid_utf
|
||||
|
||||
# Compile several patterns, push them onto the stack, and then write them
|
||||
# all to a file.
|
||||
|
||||
#pattern push
|
||||
|
||||
/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
|
||||
(?(DEFINE)
|
||||
(?<NAME_PAT>[a-z]+)
|
||||
(?<ADDRESS_PAT>\d+)
|
||||
)/x
|
||||
/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
|
||||
|
||||
#save testsaved1
|
||||
|
||||
# Do it again for some more patterns.
|
||||
|
||||
/(*MARK:A)(*SKIP:B)(C|X)/mark
|
||||
** Ignored when compiled pattern is stacked with 'push': mark
|
||||
/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
|
||||
|
||||
#save testsaved2
|
||||
#pattern -push
|
||||
|
||||
# Reload the patterns, then pop them one by one and check them.
|
||||
|
||||
#load testsaved1
|
||||
#load testsaved2
|
||||
|
||||
#pop info
|
||||
Capturing subpattern count = 2
|
||||
Max back reference = 2
|
||||
Named capturing subpatterns:
|
||||
n 1
|
||||
n 2
|
||||
Options: dupnames
|
||||
Starting code units: b f
|
||||
Subject length lower bound = 6
|
||||
foofoo
|
||||
0: foofoo
|
||||
1: foo
|
||||
barbar
|
||||
0: barbar
|
||||
1: <unset>
|
||||
2: bar
|
||||
|
||||
#pop mark
|
||||
C
|
||||
0: C
|
||||
1: C
|
||||
MK: A
|
||||
D
|
||||
No match, mark = A
|
||||
|
||||
#pop
|
||||
AmanaplanacanalPanama
|
||||
0: AmanaplanacanalPanama
|
||||
1: <unset>
|
||||
2: <unset>
|
||||
3: AmanaplanacanalPanama
|
||||
4: A
|
||||
|
||||
#pop info
|
||||
Capturing subpattern count = 4
|
||||
Named capturing subpatterns:
|
||||
ADDR 2
|
||||
ADDRESS_PAT 4
|
||||
NAME 1
|
||||
NAME_PAT 3
|
||||
Options: extended
|
||||
Subject length lower bound = 3
|
||||
metcalfe 33
|
||||
0: metcalfe 33
|
||||
1: metcalfe
|
||||
2: 33
|
||||
|
||||
# Check for an error when different tables are used.
|
||||
|
||||
/abc/push,tables=1
|
||||
/xyz/push,tables=2
|
||||
#save testsaved1
|
||||
Serialization failed: error -30: patterns do not all use the same character tables
|
||||
|
||||
#pop
|
||||
xyz
|
||||
0: xyz
|
||||
|
||||
#pop
|
||||
abc
|
||||
0: abc
|
||||
|
||||
#pop should give an error
|
||||
** Can't pop off an empty stack
|
||||
pqr
|
||||
|
||||
# End of testinput20
|
Loading…
Reference in New Issue