Add more tests.

This commit is contained in:
Philip.Hazel 2014-08-03 17:50:08 +00:00
parent 2addfec25d
commit 8792477279
15 changed files with 24064 additions and 239 deletions

206
RunTest
View File

@ -48,17 +48,16 @@
# Define test titles in variables so that they can be output as a list. Some
# of them are modified (e.g. with -8 or -16) when used in the actual tests.
title1="Test 1: Main functionality (Compatible with Perl >= 5.10)"
title1="Test 1: Main non-UTF, non-UCP functionality (compatible with Perl >= 5.10)"
title2="Test 2: API, errors, internals, and non-Perl stuff"
title3="Test 3: Locale-specific features"
title4A="Test 4: UTF"
title4B=" and Unicode property support (Compatible with Perl >= 5.10)"
#title5="Test 5: API, internals, and non-Perl stuff for UTF"
#title6="Test 6: Unicode property support (Compatible with Perl >= 5.10)"
#title7="Test 7: API, internals, and non-Perl stuff for Unicode property support"
#title8="Test 8: DFA matching main functionality"
#title9="Test 9: DFA matching with UTF"
#title10="Test 10: DFA matching with Unicode properties"
title4B=" and Unicode property support (compatible with Perl >= 5.10)"
title5A="Test 5: API, internals, and non-Perl stuff for UTF"
title5B=" and UCP support"
title6="Test 6: DFA matching main non-UTF, non-UCP functionality"
title7A="Test 7: DFA matching with UTF"
title7B=" and Unicode property support"
#title11="Test 11: Internal offsets and code size tests"
#title12="Test 12: JIT-specific features (when JIT is available)"
#title13="Test 13: JIT-specific features (when JIT is not available)"
@ -80,12 +79,12 @@ maxtest=2
if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title1
echo $title2 "(not UTF)"
echo $title2 "(not UTF or UCP)"
echo $title3
echo $title4A $title4B
# echo $title5 support
# echo $title6
# echo $title7
echo $title5A $title5B
echo $title6
echo $title7A $title7B
# echo $title8
# echo $title9
# echo $title10
@ -176,9 +175,9 @@ do1=no
do2=no
do3=no
do4=no
#do5=no
#do6=no
#do7=no
do5=no
do6=no
do7=no
#do8=no
#do9=no
#do10=no
@ -205,9 +204,9 @@ while [ $# -gt 0 ] ; do
2) do2=yes;;
3) do3=yes;;
4) do4=yes;;
# 5) do5=yes;;
# 6) do6=yes;;
# 7) do7=yes;;
5) do5=yes;;
6) do6=yes;;
7) do7=yes;;
# 8) do8=yes;;
# 9) do9=yes;;
# 10) do10=yes;;
@ -346,9 +345,10 @@ fi
# If no specific tests were requested, select all. Those that are not
# relevant will be automatically skipped.
if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no \
if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \
$do5 = no -a $do6 = no -a $do7 = no \
]; then
# -a $do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \
# -a $do8 = no -a \
# $do9 = no -a $do10 = no -a $do11 = no -a $do12 = no -a \
# $do13 = no -a $do14 = no -a $do15 = no -a $do16 = no -a \
# $do17 = no -a $do18 = no -a $do19 = no -a $do20 = no -a \
@ -359,9 +359,9 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no \
do2=yes
do3=yes
do4=yes
# do5=yes
# do6=yes
# do7=yes
do5=yes
do6=yes
do7=yes
# do8=yes
# do9=yes
# do10=yes
@ -425,7 +425,7 @@ fi
# PCRE2 tests that are not JIT or Perl-compatible: API, errors, internals
if [ $do2 = yes ] ; then
echo $title2 "(not UTF-$bits)"
echo $title2 "(excluding UTF-$bits)"
for opt in "" $jitopt; do
$sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput2 testtry
if [ $? = 0 ] ; then
@ -537,117 +537,53 @@ if [ $do4 = yes ] ; then
fi
fi
#if [ $do5 = yes ] ; then
# echo ${title5}-${bits} support
# if [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput5 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput5 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
#if [ $do6 = yes ] ; then
# echo $title6
# if [ $utf -eq 0 -o $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput6 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput6 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Test non-Perl-compatible Unicode property support
#
#if [ $do7 = yes ] ; then
# echo $title7
# if [ $utf -eq 0 -o $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput7 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput7 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for DFA matching support
#
#if [ $do8 = yes ] ; then
# echo $title8
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput8 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput8 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
#fi
#
#if [ $do9 = yes ] ; then
# echo ${title9}-${bits}
# if [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput9 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput9 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
# fi
#fi
#
#if [ $do10 = yes ] ; then
# echo $title10
# if [ $utf -eq 0 -o $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput10 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput10 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
# fi
#fi
#
# Test 5 runs only when selected. The title is printed with the code unit
# width ($bits) spliced between its two halves.
if [ $do5 = yes ] ; then
echo ${title5A}-${bits}$title5B
# Skip with a message when $utf is 0.
if [ $utf -eq 0 ] ; then
echo " Skipped because UTF-$bits support is not available"
else
# One pass with no extra option, then one pass for each word in $jitopt.
for opt in "" $jitopt; do
# pcre2test writes its results to testtry; a non-zero status from pcre2test,
# or from the $cf comparison against the expected output, ends the whole
# script with exit status 1.
$sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput5 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput5 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
# Report which variant of the run just passed.
if [ "$opt" = "-jit" ] ; then echo " OK with JIT"
else echo " OK"
fi
done
fi
fi
# Tests for DFA matching support
# Test 6 runs only when selected. It is a single pcre2test pass over
# testinput6 with no per-option loop.
# NOTE(review): no -dfa flag is passed here; presumably the input file itself
# selects DFA matching -- confirm against testdata/testinput6.
if [ $do6 = yes ] ; then
echo $title6
$sim $valgrind ./pcre2test -q $bmode $testdata/testinput6 testtry
# A non-zero status from pcre2test, or from the $cf comparison against the
# expected output, ends the whole script with exit status 1.
if [ $? = 0 ] ; then
$cf $testdata/testoutput6 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo " OK"
fi
# Test 7 runs only when selected, and is skipped with a message when $utf
# is 0. It is a single pcre2test pass over testinput7.
#
# No $opt is passed: this block has no "for opt in ..." loop, so $opt would
# only hold whatever value an earlier test's loop happened to leave behind,
# making this test's command line depend on which tests ran before it.
if [ $do7 = yes ] ; then
  echo ${title7A}-${bits}$title7B
  if [ $utf -eq 0 ] ; then
    echo " Skipped because UTF-$bits support is not available"
  else
    $sim $valgrind ./pcre2test -q $bmode $testdata/testinput7 testtry
    # A non-zero status from pcre2test, or from the $cf comparison against
    # the expected output, ends the whole script with exit status 1.
    if [ $? = 0 ] ; then
      $cf $testdata/testoutput7 testtry
      if [ $? != 0 ] ; then exit 1; fi
    else exit 1
    fi
    echo " OK"
  fi
fi
## Test of internal offsets and code sizes. This test is run only when there
## is Unicode property support and the link size is 2. The actual tests are
## mostly the same as in some of the above, but in this test we inspect some

View File

@ -123,19 +123,21 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_JIT_PARTIAL_SOFT 0x00000002
#define PCRE2_JIT_PARTIAL_HARD 0x00000004
/* These are for pcre2_match() and pcre2_dfa_match(). */
/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED,
PCRE2_NO_START_OPTIMIZE, and PCRE2_NO_UTF_CHECK can also be passed to these
functions, so take care not to define synonyms by mistake. */
#define PCRE2_NOTBOL 0x00000001
#define PCRE2_NOTEOL 0x00000002
#define PCRE2_NOTEMPTY 0x00000004
#define PCRE2_NOTEMPTY_ATSTART 0x00000008
#define PCRE2_PARTIAL_SOFT 0x00000010
#define PCRE2_PARTIAL_HARD 0x00000020
#define PCRE2_NOTBOL 0x00000008
#define PCRE2_NOTEOL 0x00000010
#define PCRE2_NOTEMPTY 0x00000020
#define PCRE2_NOTEMPTY_ATSTART 0x00000040
#define PCRE2_PARTIAL_SOFT 0x00000080
#define PCRE2_PARTIAL_HARD 0x00000100
/* These are additional options for pcre2_dfa_match(). */
#define PCRE2_DFA_RESTART 0x00000040
#define PCRE2_DFA_SHORTEST 0x00000080
#define PCRE2_DFA_RESTART 0x00000200
#define PCRE2_DFA_SHORTEST 0x00000400
/* Newline and \R settings, for use in the compile and match contexts. The
newline values must be kept in step with values set in config.h and both sets

View File

@ -123,19 +123,21 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_JIT_PARTIAL_SOFT 0x00000002
#define PCRE2_JIT_PARTIAL_HARD 0x00000004
/* These are for pcre2_match() and pcre2_dfa_match(). */
/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED,
PCRE2_NO_START_OPTIMIZE, and PCRE2_NO_UTF_CHECK can also be passed to these
functions, so take care not to define synonyms by mistake. */
#define PCRE2_NOTBOL 0x00000001
#define PCRE2_NOTEOL 0x00000002
#define PCRE2_NOTEMPTY 0x00000004
#define PCRE2_NOTEMPTY_ATSTART 0x00000008
#define PCRE2_PARTIAL_SOFT 0x00000010
#define PCRE2_PARTIAL_HARD 0x00000020
#define PCRE2_NOTBOL 0x00000008
#define PCRE2_NOTEOL 0x00000010
#define PCRE2_NOTEMPTY 0x00000020
#define PCRE2_NOTEMPTY_ATSTART 0x00000040
#define PCRE2_PARTIAL_SOFT 0x00000080
#define PCRE2_PARTIAL_HARD 0x00000100
/* These are additional options for pcre2_dfa_match(). */
#define PCRE2_DFA_RESTART 0x00000040
#define PCRE2_DFA_SHORTEST 0x00000080
#define PCRE2_DFA_RESTART 0x00000200
#define PCRE2_DFA_SHORTEST 0x00000400
/* Newline and \R settings, for use in the compile and match contexts. The
newline values must be kept in step with values set in config.h and both sets

View File

@ -107,14 +107,14 @@ return -1;
REAL_PCRE *re = (REAL_PCRE *)argument_re;
pcre_study_data *study;
#ifndef COMPILE_PCRE8
#if PCRE2_CODE_UNIT_WIDTH != 8
pcre_uchar *ptr;
int length;
#if defined SUPPORT_UTF && defined COMPILE_PCRE16
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 16
BOOL utf;
BOOL utf16_char;
#endif /* SUPPORT_UTF && COMPILE_PCRE16 */
#endif /* !COMPILE_PCRE8 */
#endif
#endif
if (re == NULL) return PCRE_ERROR_NULL;
if (re->magic_number == MAGIC_NUMBER)
@ -134,10 +134,10 @@ re->flags = swap_uint32(re->flags);
re->limit_match = swap_uint32(re->limit_match);
re->limit_recursion = swap_uint32(re->limit_recursion);
#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
re->first_char = swap_uint16(re->first_char);
re->req_char = swap_uint16(re->req_char);
#elif defined COMPILE_PCRE32
#elif PCRE2_CODE_UNIT_WIDTH == 32
re->first_char = swap_uint32(re->first_char);
re->req_char = swap_uint32(re->req_char);
#endif
@ -159,27 +159,27 @@ if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
study->minlength = swap_uint32(study->minlength);
}
#ifndef COMPILE_PCRE8
#if PCRE2_CODE_UNIT_WIDTH != 8
ptr = (pcre_uchar *)re + re->name_table_offset;
length = re->name_count * re->name_entry_size;
#if defined SUPPORT_UTF && defined COMPILE_PCRE16
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 16
utf = (re->options & PCRE_UTF16) != 0;
utf16_char = FALSE;
#endif /* SUPPORT_UTF && COMPILE_PCRE16 */
#endif
while(TRUE)
{
/* Swap previous characters. */
while (length-- > 0)
{
#if defined COMPILE_PCRE16
#if PCRE2_CODE_UNIT_WIDTH == 16
*ptr = swap_uint16(*ptr);
#elif defined COMPILE_PCRE32
#elif PCRE2_CODE_UNIT_WIDTH == 32
*ptr = swap_uint32(*ptr);
#endif
ptr++;
}
#if defined SUPPORT_UTF && defined COMPILE_PCRE16
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 16
if (utf16_char)
{
if (HAS_EXTRALEN(ptr[-1]))
@ -194,9 +194,9 @@ while(TRUE)
/* Get next opcode. */
length = 0;
#if defined COMPILE_PCRE16
#if PCRE2_CODE_UNIT_WIDTH == 16
*ptr = swap_uint16(*ptr);
#elif defined COMPILE_PCRE32
#elif PCRE2_CODE_UNIT_WIDTH == 32
*ptr = swap_uint32(*ptr);
#endif
switch (*ptr)
@ -204,7 +204,7 @@ while(TRUE)
case OP_END:
return 0;
#if defined SUPPORT_UTF && defined COMPILE_PCRE16
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 16
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
@ -279,12 +279,12 @@ while(TRUE)
case OP_XCLASS:
/* Reverse the size of the XCLASS instance. */
ptr++;
#if defined COMPILE_PCRE16
#if PCRE2_CODE_UNIT_WIDTH == 16
*ptr = swap_uint16(*ptr);
#elif defined COMPILE_PCRE32
#elif PCRE2_CODE_UNIT_WIDTH == 32
*ptr = swap_uint32(*ptr);
#endif
#ifndef COMPILE_PCRE32
#if PCRE2_CODE_UNIT_WIDTH != 32
if (LINK_SIZE > 1)
{
/* LINK_SIZE can be 1 or 2 in 16 bit mode. */
@ -294,9 +294,9 @@ while(TRUE)
#endif
ptr++;
length = (GET(ptr, -LINK_SIZE)) - (1 + LINK_SIZE + 1);
#if defined COMPILE_PCRE16
#if PCRE2_CODE_UNIT_WIDTH == 16
*ptr = swap_uint16(*ptr);
#elif defined COMPILE_PCRE32
#elif PCRE2_CODE_UNIT_WIDTH == 32
*ptr = swap_uint32(*ptr);
#endif
if ((*ptr & XCL_MAP) != 0)
@ -310,7 +310,7 @@ while(TRUE)
ptr++;
}
/* Control should never reach here in 16/32 bit mode. */
#endif /* !COMPILE_PCRE8 */
#endif
#endif /* NEVER */

View File

@ -54,21 +54,22 @@ POSSIBILITY OF SUCH DAMAGE.
by defining macros in order to minimize #if usage. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#define XDIGIT(c) xdigitab[c]
#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#define XDIGIT(c) xdigitab[c]
#else /* Either 16-bit or 32-bit */
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#if PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#else /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
#endif
/* Function definitions to allow mutual recursion */
static int
@ -1308,7 +1309,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
actual length is stored in the compiled code, so we must update "code"
here. */
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
#if defined SUPPORT_UTF || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
ccode = code += GET(code, 1);
goto CHECK_CLASS_REPEAT;
@ -1318,7 +1319,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_NCLASS:
ccode = code + PRIV(OP_lengths)[OP_CLASS];
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
#if defined SUPPORT_UTF || PCRE2_CODE_UNIT_WIDTH != 8
CHECK_CLASS_REPEAT:
#endif
@ -1875,7 +1876,7 @@ else
c -= CHAR_0;
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
c = c * 8 + *(++ptr) - CHAR_0;
#ifdef COMPILE_PCRE8
#if PCRE2_CODE_UNIT_WIDTH == 8
if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
break;
@ -1894,15 +1895,15 @@ else
{
cc = *ptr++;
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
#ifdef COMPILE_PCRE32
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c >= 0x20000000l) { overflow = TRUE; break; }
#endif
c = (c << 3) + cc - CHAR_0 ;
#if defined COMPILE_PCRE8
#if PCRE2_CODE_UNIT_WIDTH == 8
if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
#elif PCRE2_CODE_UNIT_WIDTH == 16
if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
#elif PCRE2_CODE_UNIT_WIDTH == 32
if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
}
@ -3039,7 +3040,6 @@ dynamically as we process the pattern. */
#ifdef SUPPORT_UTF
BOOL utf = (options & PCRE2_UTF) != 0;
#if PCRE2_CODE_UNIT_WIDTH != 32
#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
PCRE2_UCHAR utf_units[6]; /* For setting up multi-cu chars */
#endif
@ -7608,7 +7608,7 @@ help in the case when a regex compiled on a system with 4-byte pointers is run
on another with 8-byte pointers. */
#ifdef FIXME
#ifdef COMPILE_PCRE32
#if PCRE2_CODE_UNIT_WIDTH == 32
re->dummy = 0;
#else
re->dummy1 = re->dummy2 = re->dummy3 = 0;

View File

@ -1400,7 +1400,7 @@ for (;;)
case 0x2028:
case 0x2029:
#endif /* Not EBCDIC */
if ((mb->moptions & PCRE2_BSR_ANYCRLF) != 0) break;
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
goto ANYNL01;
case CHAR_CR:
@ -1669,7 +1669,7 @@ for (;;)
case 0x2028:
case 0x2029:
#endif /* Not EBCDIC */
if ((mb->moptions & PCRE2_BSR_ANYCRLF) != 0) break;
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
goto ANYNL02;
case CHAR_CR:
@ -1939,7 +1939,7 @@ for (;;)
case 0x2028:
case 0x2029:
#endif /* Not EBCDIC */
if ((mb->moptions & PCRE2_BSR_ANYCRLF) != 0) break;
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
goto ANYNL03;
case CHAR_CR:
@ -2121,7 +2121,7 @@ for (;;)
case 0x2028:
case 0x2029:
#endif /* Not EBCDIC */
if ((mb->moptions & PCRE2_BSR_ANYCRLF) != 0) break;
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
case CHAR_LF:
ADD_NEW(state_offset + 1, 0);
@ -3378,7 +3378,7 @@ for (;;)
/* The following two optimizations are disabled for partial matching. */
if ((mb->moptions & PCRE2_PARTIAL_HARD & PCRE2_PARTIAL_SOFT) == 0)
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
{
/* The minimum matching length is a lower bound; no actual string of that
length may actually match the pattern. Although the value is, strictly,
@ -3470,6 +3470,8 @@ for (;;)
match_data->ovector[1] = (PCRE2_OFFSET)(end_subject - subject);
}
match_data->leftchar = (PCRE2_OFFSET)(mb->start_used_ptr - subject);
match_data->rightchar = 0; /* FIXME */
match_data->startchar = (PCRE2_OFFSET)(start_match - subject);
match_data->rc = rc;
return rc;
}

View File

@ -90,30 +90,26 @@ static unsigned int
print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf)
{
uint32_t c = *ptr;
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
int a, i, s;
#endif
BOOL one_code_unit = !utf;
/* If UTF is supported and requested, check for a one-code-unit character. The
16-bit and 32-bit tests are for malformed UTF, and should only trigger if the
sanity check is turned off. */
/* If UTF is supported and requested, check for a valid single code unit. */
#ifdef SUPPORT_UTF
if (utf)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
utf = (c & 0xc0) == 0xc0;
one_code_unit = c < 0x80;
#elif PCRE2_CODE_UNIT_WIDTH == 16
utf = (c & 0xfc00) == 0xd800;
one_code_unit = (c & 0xfc00) != 0xd800;
#else
utf = (c & 0xfffff800u) != 0xd800u;
one_code_unit = (c & 0xfffff800u) != 0xd800u;
#endif
}
#endif /* SUPPORT_UTF */
/* Handle a one-code-unit character at any width. */
/* Handle a valid one-code-unit character at any width. */
if (!utf)
if (one_code_unit)
{
if (PRINTABLE(c)) fprintf(f, "%c", (char)c);
else if (c < 0x80) fprintf(f, "\\x%02x", c);
@ -121,41 +117,43 @@ if (!utf)
return 0;
}
/* Per-width code for handling non-one-code-unit UTF characters. */
/* Per-width code for invalid UTF code units and multi-unit UTF characters. */
#ifdef SUPPORT_UTF
/* Handle a multi-byte UTF-8 character. */
/* Malformed UTF-8 should occur only if the sanity check has been turned off.
Rather than swallow random bytes, just stop if we hit a bad one. Print it with
\X instead of \x as an indication. */
#if PCRE2_CODE_UNIT_WIDTH == 8
a = utf8_table4[c & 0x3f]; /* Number of additional bytes */
s = 6*a;
c = (c & utf8_table3[a]) << s;
for (i = 1; i <= a; i++)
if ((c & 0xc0) != 0xc0)
{
/* This is a check for malformed UTF-8; it should only occur if the sanity
check has been turned off. Rather than swallow random bytes, just stop if
we hit a bad one. Print it with \X instead of \x as an indication. */
if ((ptr[i] & 0xc0) != 0x80)
{
fprintf(f, "\\X{%x}", c);
return i - 1;
}
/* The byte is OK */
s -= 6;
c |= (ptr[i] & 0x3f) << s;
fprintf(f, "\\X{%x}", c); /* Invalid starting byte */
return 0;
}
fprintf(f, "\\x{%x}", c);
return a;
else
{
int i;
int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */
int s = 6*a;
c = (c & utf8_table3[a]) << s;
for (i = 1; i <= a; i++)
{
if ((ptr[i] & 0xc0) != 0x80)
{
fprintf(f, "\\X{%x}", c); /* Invalid secondary byte */
return i - 1;
}
s -= 6;
c |= (ptr[i] & 0x3f) << s;
}
fprintf(f, "\\x{%x}", c);
return a;
}
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
/* Handle a multi-code-unit UTF-16 character, starting with a check for
malformed UTF-16; it should only occur if the sanity check has been turned off.
Rather than swallow a low surrogate, just stop if we hit a bad one. Print it
with \X instead of \x as an indication. */
/* UTF-16: rather than swallow a low surrogate, just stop if we hit a bad one.
Print it with \X instead of \x as an indication. */
#if PCRE2_CODE_UNIT_WIDTH == 16
if ((ptr[1] & 0xfc00) != 0xdc00)
@ -176,7 +174,7 @@ as an indication. */
fprintf(f, "\\X{%x}", c);
return 0;
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UTF */
}

View File

@ -751,7 +751,7 @@ set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
register uint32_t c;
int yield = SSB_DONE;
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
int table_limit = utf? 16:32;
#else
int table_limit = 32;

1569
testdata/testinput5 vendored Normal file

File diff suppressed because it is too large Load Diff

4786
testdata/testinput6 vendored Normal file

File diff suppressed because it is too large Load Diff

2126
testdata/testinput7 vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -47,7 +47,7 @@ Subject length lower bound = 3
abc
0: abc
abc\=anchored
No match
0: abc
*** Failers
No match
defabc
@ -352,7 +352,7 @@ Subject length lower bound = 3
abcdef
0: abc
abcdef\=notbol
No match
0: def
/.*((abc)$|(def))/I
Capturing subpattern count = 3

3963
testdata/testoutput5 vendored Normal file

File diff suppressed because it is too large Load Diff

7669
testdata/testoutput6 vendored Normal file

File diff suppressed because it is too large Load Diff

3772
testdata/testoutput7 vendored Normal file

File diff suppressed because it is too large Load Diff