Tests 1 and 2 are converted (but without save/restore).

This commit is contained in:
Philip.Hazel 2014-07-24 16:32:38 +00:00
parent 1701838220
commit 017b6a1624
12 changed files with 35118 additions and 217 deletions

995
RunTest Executable file
View File

@ -0,0 +1,995 @@
#! /bin/sh
###############################################################################
# Run the PCRE2 tests using the pcre2test program. The appropriate tests are
# selected, depending on which build-time options were used.
#
# When JIT support is available, all appropriate tests are run with and without
# JIT, unless "nojit" is given on the command line. There are also two tests
# for JIT-specific features, one to be run when JIT support is available
# (unless "nojit" is specified), and one when it is not.
#
# Whichever of the 8-, 16- and 32-bit libraries exist are tested. It is also
# possible to select which to test by giving "-8", "-16" or "-32" on the
# command line.
#
# As well as "nojit", "-8", "-16", and "-32", arguments for this script are
# individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
# end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
# runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
# except test 10. Whatever order the arguments are in, the tests are always run
# in numerical order.
#
# The special argument "3S" runs test 3, stopping if it fails. Test 3 is the
# locale test, and failure usually means there's an issue with the locale
# rather than a bug in PCRE2, so normally subsequent tests are run. "3S" is
# useful when you want to debug or update the test.
#
# Inappropriate tests are automatically skipped (with a comment to say so): for
# example, if JIT support is not compiled, test 12 is skipped, whereas if JIT
# support is compiled, test 13 is skipped.
#
# Other arguments can be one of the words "valgrind", "valgrind-log", or "sim"
# followed by an argument to run cross-compiled executables under a simulator,
# for example:
#
# RunTest 3 sim "qemu-arm -s 8388608"
#
# There are two special cases where only one argument is allowed:
#
# If the first and only argument is "ebcdic", the script runs the special
# EBCDIC test that can be useful for checking certain EBCDIC features, even
# when run in an ASCII environment.
#
# If the script is obeyed as "RunTest list", a list of available tests is
# output, but none of them are run.
###############################################################################
# Human-readable titles for each test. They live in variables so that the same
# text serves both the "list" output below and the per-test banners later on;
# some banners append a suffix (such as the bit width) when they are printed.
# Titles for tests that have not been converted yet are kept as comments.
title1='Test 1: Main functionality (Compatible with Perl >= 5.10)'
title2='Test 2: API, errors, internals, and non-Perl stuff'
#title3="Test 3: Locale-specific features"
#title4A="Test 4: UTF"
#title4B=" support (Compatible with Perl >= 5.10)"
#title5="Test 5: API, internals, and non-Perl stuff for UTF"
#title6="Test 6: Unicode property support (Compatible with Perl >= 5.10)"
#title7="Test 7: API, internals, and non-Perl stuff for Unicode property support"
#title8="Test 8: DFA matching main functionality"
#title9="Test 9: DFA matching with UTF"
#title10="Test 10: DFA matching with Unicode properties"
#title11="Test 11: Internal offsets and code size tests"
#title12="Test 12: JIT-specific features (when JIT is available)"
#title13="Test 13: JIT-specific features (when JIT is not available)"
#title14="Test 14: Specials for the basic 8-bit library"
#title15="Test 15: Specials for the 8-bit library with UTF-8 support"
#title16="Test 16: Specials for the 8-bit library with Unicode property support"
#title17="Test 17: Specials for the basic 16/32-bit library"
#title18="Test 18: Specials for the 16/32-bit library with UTF-16/32 support"
#title19="Test 19: Specials for the 16/32-bit library with Unicode property support"
#title20="Test 20: DFA specials for the basic 16/32-bit library"
#title21="Test 21: Reloads for the basic 16/32-bit library"
#title22="Test 22: Reloads for the 16/32-bit library with UTF-16/32 support"
#title23="Test 23: Specials for the 16-bit library"
#title24="Test 24: Specials for the 16-bit library with UTF-16 support"
#title25="Test 25: Specials for the 32-bit library"
#title26="Test 26: Specials for the 32-bit library with UTF-32 support"

# Highest test number currently available; open-ended ranges such as "1-"
# stop here.
maxtest=2

# "RunTest list": print the titles of the available tests and stop without
# running anything. Only recognised when "list" is the sole argument.
if [ "$#" -eq 1 ] && [ "$1" = list ]; then
  echo "$title1"
  echo "$title2 (not UTF)"
  # echo $title3
  # echo $title4A $title4B
  # echo $title5 support
  # echo $title6
  # echo $title7
  # echo $title8
  # echo $title9
  # echo $title10
  # echo $title11
  # echo $title12
  # echo $title13
  # echo $title14
  # echo $title15
  # echo $title16
  # echo $title17
  # echo $title18
  # echo $title19
  # echo $title20
  # echo $title21
  # echo $title22
  # echo $title23
  # echo $title24
  # echo $title25
  # echo $title26
  exit 0
fi
# Pick the command used to compare actual output with expected output.
# Unified diffs are preferred, but not every diff understands -u, so probe
# for it on two empty inputs and fall back to plain diff when the probe fails.
if diff -u /dev/null /dev/null 2>/dev/null; then
  cf="diff -u"
else
  cf="diff"
fi
# Locate the directory holding the test input and expected-output files.
# A usable $srcdir wins; otherwise look in the current directory and then in
# its parent. Give up with an error if none of these exists.
testdata=
if [ -n "$srcdir" ] && [ -d "$srcdir" ]; then
  testdata="$srcdir/testdata"
else
  for candidate in ./testdata ../testdata; do
    if [ -d "$candidate" ]; then
      testdata=$candidate
      break
    fi
  done
fi
if [ -z "$testdata" ]; then
  echo "Cannot find the testdata directory"
  exit 1
fi
# ------ Special EBCDIC Test -------
# Runs only when "ebcdic" is the sole argument. "pcre2test -C ebcdic" must
# exit with status 1, otherwise EBCDIC support is reported as not compiled.
# The EBCDIC input is then run twice, normally and with -dfa, and each output
# is compared with the expected file. Any failure ends the script with
# status 1; the script always exits at the end of this section.
if [ "$#" -eq 1 ] && [ "$1" = ebcdic ]; then
  ./pcre2test -C ebcdic >/dev/null
  ebcdic=$?
  if [ "$ebcdic" -ne 1 ]; then
    echo "Cannot run EBCDIC tests: EBCDIC support not compiled"
    exit 1
  fi
  for opt in "" "-dfa"; do
    # $opt is left unquoted on purpose: when empty it must vanish rather than
    # be passed as an empty argument.
    ./pcre2test -q $opt $testdata/testinputEBC >testtry || exit 1
    $cf $testdata/testoutputEBC testtry || exit 1
    case "$opt" in
      -dfa) echo " OK using DFA" ;;
      *) echo " OK" ;;
    esac
  done
  exit 0
fi
# ------ Normal Tests ------

# Option state filled in by the argument loop below. An empty value means
# "not requested".
arg8=        # "yes" when -8 was given
arg16=       # "yes" when -16 was given
arg32=       # "yes" when -32 was given
nojit=       # "yes" when "nojit" was given
sim=         # simulator command prefix (the argument that follows "sim")
skip=        # space-separated test numbers given as ~N
valgrind=    # valgrind command prefix, when requested

# This is in case the caller has set aliases (as I do - PH)
# NOTE(review): "unset" removes shell variables and functions; aliases are
# removed with "unalias" - confirm which was intended.
unset cp ls mv rm

# Process options and select which tests to run; for those that are explicitly
# requested, check that the necessary optional facilities are available.
# One doN flag per test: "yes" means run it, "no" means do not.
do1=no
do2=no
#do3=no
#do4=no
#do5=no
#do6=no
#do7=no
#do8=no
#do9=no
#do10=no
#do11=no
#do12=no
#do13=no
#do14=no
#do15=no
#do16=no
#do17=no
#do18=no
#do19=no
#do20=no
#do21=no
#do22=no
#do23=no
#do24=no
#do25=no
#do26=no

while [ $# -gt 0 ] ; do
  case $1 in
    1) do1=yes;;
    2) do2=yes;;
#   3) do3=yes;;
#   4) do4=yes;;
#   5) do5=yes;;
#   6) do6=yes;;
#   7) do7=yes;;
#   8) do8=yes;;
#   9) do9=yes;;
#  10) do10=yes;;
#  11) do11=yes;;
#  12) do12=yes;;
#  13) do13=yes;;
#  14) do14=yes;;
#  15) do15=yes;;
#  16) do16=yes;;
#  17) do17=yes;;
#  18) do18=yes;;
#  19) do19=yes;;
#  20) do20=yes;;
#  21) do21=yes;;
#  22) do22=yes;;
#  23) do23=yes;;
#  24) do24=yes;;
#  25) do25=yes;;
#  26) do26=yes;;
    -8) arg8=yes;;
    -16) arg16=yes;;
    -32) arg32=yes;;
    nojit) nojit=yes;;
    sim)
      # "sim" consumes the next argument as the simulator command. Without
      # this check a trailing "sim" would shift past the end of the argument
      # list, which some shells treat as a fatal error.
      if [ $# -lt 2 ] ; then
        echo "Missing simulator command after 'sim'"; exit 1
      fi
      shift; sim=$1;;
    valgrind) valgrind="valgrind --tool=memcheck -q --smc-check=all";;
    valgrind-log) valgrind="valgrind --tool=memcheck --num-callers=30 --leak-check=no --error-limit=no --smc-check=all --log-file=report.%p ";;
    ~*)
      # ~N excludes test N; the number is only recorded here and applied
      # after the default selection has been made.
      if expr "$1" : '~[0-9][0-9]*$' >/dev/null; then
        skip="$skip `expr "$1" : '~\([0-9]*\)*$'`"
      else
        echo "Unknown option or test selector '$1'"; exit 1
      fi
      ;;
    *-*)
      # N-M selects an inclusive range; N- runs from N up to $maxtest.
      if expr "$1" : '[0-9][0-9]*-[0-9]*$' >/dev/null; then
        tf=`expr "$1" : '\([0-9]*\)'`
        tt=`expr "$1" : '.*-\([0-9]*\)'`
        if [ "$tt" = "" ] ; then tt=$maxtest; fi
        if expr \( "$tf" "<" 1 \) \| \( "$tt" ">" "$maxtest" \) >/dev/null; then
          echo "Invalid test range '$1'"; exit 1
        fi
        # tf and tt are digits only at this point, so the eval is safe.
        while expr "$tf" "<=" "$tt" >/dev/null; do
          eval do${tf}=yes
          tf=`expr $tf + 1`
        done
      else
        echo "Invalid test range '$1'"; exit 1
      fi
      ;;
    *) echo "Unknown option or test selector '$1'"; exit 1;;
  esac
  shift
done
# Find which optional facilities are available.
# The internal link size is taken from the exit status of
# "pcre2test -C linksize"; anything outside the range 2..4 is treated as a
# failure to obtain it.
$sim ./pcre2test -C linksize >/dev/null
link_size=$?
if [ "$link_size" -lt 2 ] || [ "$link_size" -gt 4 ]; then
  echo "Failed to find internal link size"
  exit 1
fi
# Any combination of the 8-bit, 16-bit and 32-bit libraries may have been
# built, but at least one is needed. A non-zero exit status from
# "pcre2test -C pcreN" is taken to mean that the N-bit library is present.
$sim ./pcre2test -C pcre8 >/dev/null
support8=$?
$sim ./pcre2test -C pcre16 >/dev/null
support16=$?
$sim ./pcre2test -C pcre32 >/dev/null
support32=$?

# testN holds the pcre2test option that selects a width (empty for 8-bit),
# or "skip" when that width is not to be tested. Start with all skipped.
test8=skip
test16=skip
test32=skip

if [ -z "$arg8$arg16$arg32" ]; then
  # No width was requested: test every library that exists.
  if [ "$support8" -ne 0 ]; then test8=; fi
  if [ "$support16" -ne 0 ]; then test16=-16; fi
  if [ "$support32" -ne 0 ]; then test32=-32; fi
else
  # Explicit requests: asking for a library that was not built is an error.
  case "$arg8:$support8" in
    yes:0)
      echo "Cannot run 8-bit library tests: 8-bit library not compiled"
      exit 1
      ;;
    yes:*) test8= ;;
  esac
  case "$arg16:$support16" in
    yes:0)
      echo "Cannot run 16-bit library tests: 16-bit library not compiled"
      exit 1
      ;;
    yes:*) test16=-16 ;;
  esac
  case "$arg32:$support32" in
    yes:0)
      echo "Cannot run 32-bit library tests: 32-bit library not compiled"
      exit 1
      ;;
    yes:*) test32=-32 ;;
  esac
fi
# UTF support is a single setting shared by every bit width that was built;
# it is not possible to have, for example, UTF-8 support without UTF-16
# support. The exit status of "pcre2test -C utf" is kept in $utf.
$sim ./pcre2test -C utf >/dev/null
utf=$?

# A non-zero exit status from "pcre2test -C jit" enables the extra -jit pass,
# unless "nojit" was given on the command line.
$sim ./pcre2test -C jit >/dev/null
jit=$?
jitopt=
if [ "$jit" -ne 0 ] && [ "$nojit" != yes ]; then
  jitopt=-jit
fi
# When the command line named no test at all, run every test. Tests that do
# not apply to this build are skipped automatically later on.
if [ "$do1" = no ] && [ "$do2" = no ]; then
# -a $do3 = no -a $do4 = no -a \
# $do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \
# $do9 = no -a $do10 = no -a $do11 = no -a $do12 = no -a \
# $do13 = no -a $do14 = no -a $do15 = no -a $do16 = no -a \
# $do17 = no -a $do18 = no -a $do19 = no -a $do20 = no -a \
# $do21 = no -a $do22 = no -a $do23 = no -a $do24 = no -a \
# $do25 = no -a $do26 = no
  do1=yes
  do2=yes
# do3=yes
# do4=yes
# do5=yes
# do6=yes
# do7=yes
# do8=yes
# do9=yes
# do10=yes
# do11=yes
# do12=yes
# do13=yes
# do14=yes
# do15=yes
# do16=yes
# do17=yes
# do18=yes
# do19=yes
# do20=yes
# do21=yes
# do22=yes
# do23=yes
# do24=yes
# do25=yes
# do26=yes
fi

# Apply the ~N exclusions only now, after the default selection, so that a
# command line made up solely of exclusions means "everything except these".
for i in $skip; do
  eval "do$i=no"
done
# Banner: name the test data directory in use, then run pcre2test on empty
# input so that it identifies the release being tested.
echo
echo "PCRE2 C library tests using test data from" $testdata
$sim ./pcre2test /dev/null
echo
for bmode in "$test8" "$test16" "$test32"; do
# Per-width setup for one pass of the enclosing loop: leave the pass at once
# when this width is marked "skip", otherwise record the width in $bits and
# print a section header. The 16-bit and 32-bit headers are preceded by a
# blank line unless both of the other widths are skipped.
case "$bmode" in
  skip)
    continue
    ;;
  -16)
    bits=16
    if [ "$test8$test32" != "skipskip" ]; then
      echo ""
    fi
    echo "---- Testing 16-bit library ----"
    echo ""
    ;;
  -32)
    bits=32
    if [ "$test8$test16" != "skipskip" ]; then
      echo ""
    fi
    echo "---- Testing 32-bit library ----"
    echo ""
    ;;
  *)
    bits=8
    echo "---- Testing 8-bit library ----"
    echo ""
    ;;
esac
# Test 1: the primary, Perl-compatible test. It is run once without extra
# options and, when $jitopt is set, once more with it. The script stops with
# status 1 as soon as pcre2test fails or the output differs from the
# expected file.
if [ "$do1" = yes ]; then
  echo "$title1"
  for opt in "" $jitopt; do
    # $sim, $valgrind, $bmode and $opt are unquoted on purpose: each may be
    # empty or hold several words.
    if $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput1 testtry; then
      $cf $testdata/testoutput1 testtry || exit 1
    else
      exit 1
    fi
    case "$opt" in
      -jit) echo " OK with JIT" ;;
      *) echo " OK" ;;
    esac
  done
fi
# Test 2: API, error handling and internals - the parts that are neither
# Perl-compatible nor covered by test 1. Run once plainly and, when $jitopt is
# set, once more with it. A pcre2test failure prints a hint about stack usage
# before the script stops; an output mismatch stops it straight away.
if [ "$do2" = yes ]; then
  echo "$title2" "(not UTF-$bits)"
  for opt in "" $jitopt; do
    # $sim, $valgrind, $bmode and $opt are unquoted on purpose: each may be
    # empty or hold several words.
    if $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput2 testtry; then
      $cf $testdata/testoutput2 testtry || exit 1
    else
      echo " "
      echo "** Test 2 requires a lot of stack. If it has crashed with a"
      echo "** segmentation fault, it may be that you do not have enough"
      echo "** stack available by default. Please see the 'pcre2stack' man"
      echo "** page for a discussion of PCRE2's stack usage."
      echo " "
      exit 1
    fi
    case "$opt" in
      -jit) echo " OK with JIT" ;;
      *) echo " OK" ;;
    esac
  done
fi
## Locale-specific tests, provided that either the "fr_FR" or the "french"
## locale is available. The former is the Unix-like standard; the latter is
## for Windows. Another possibility is "fr". Unfortunately, different versions
## of the French locale give different outputs for some items. This test passes
## if the output matches any one of the alternative output files.
#
#if [ $do3 = yes ] ; then
# locale -a | grep '^fr_FR$' >/dev/null
# if [ $? -eq 0 ] ; then
# locale=fr_FR
# infile=$testdata/testinput3
# outfile=$testdata/testoutput3
# outfile2=$testdata/testoutput3A
# outfile3=$testdata/testoutput3B
# else
# infile=test3input
# outfile=test3output
# outfile2=test3outputA
# outfile3=test3outputB
# locale -a | grep '^french$' >/dev/null
# if [ $? -eq 0 ] ; then
# locale=french
# sed 's/fr_FR/french/' $testdata/testinput3 >test3input
# sed 's/fr_FR/french/' $testdata/testoutput3 >test3output
# sed 's/fr_FR/french/' $testdata/testoutput3A >test3outputA
# sed 's/fr_FR/french/' $testdata/testoutput3B >test3outputB
# else
# locale -a | grep '^fr$' >/dev/null
# if [ $? -eq 0 ] ; then
# locale=fr
# sed 's/fr_FR/fr/' $testdata/testinput3 >test3input
# sed 's/fr_FR/fr/' $testdata/testoutput3 >test3output
# sed 's/fr_FR/fr/' $testdata/testoutput3A >test3outputA
# sed 's/fr_FR/fr/' $testdata/testoutput3B >test3outputB
# else
# locale=
# fi
# fi
# fi
#
# if [ "$locale" != "" ] ; then
# echo $title3 "(using '$locale' locale)"
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $infile testtry
# if [ $? = 0 ] ; then
# if $cf $outfile testtry >teststdout || \
# $cf $outfile2 testtry >teststdout || \
# $cf $outfile3 testtry >teststdout
# then
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# else
# echo "** Locale test did not run successfully. The output did not match"
# echo " $outfile, $outfile2 or $outfile3."
# echo " This may mean that there is a problem with the locale settings rather"
# echo " than a bug in PCRE."
# exit 1
# fi
# else exit 1
# fi
# done
# else
# echo "Cannot test locale-specific features - none of the 'fr_FR', 'fr' or"
# echo "'french' locales exist, or the \"locale\" command is not available"
# echo "to check for them."
# echo " "
# fi
#fi
#
## Additional tests for UTF support
#
#if [ $do4 = yes ] ; then
# echo ${title4A}-${bits}${title4B}
# if [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput4 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput4 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
#if [ $do5 = yes ] ; then
# echo ${title5}-${bits} support
# if [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput5 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput5 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
#if [ $do6 = yes ] ; then
# echo $title6
# if [ $utf -eq 0 -o $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput6 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput6 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Test non-Perl-compatible Unicode property support
#
#if [ $do7 = yes ] ; then
# echo $title7
# if [ $utf -eq 0 -o $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput7 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput7 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for DFA matching support
#
#if [ $do8 = yes ] ; then
# echo $title8
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput8 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput8 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
#fi
#
#if [ $do9 = yes ] ; then
# echo ${title9}-${bits}
# if [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput9 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput9 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
# fi
#fi
#
#if [ $do10 = yes ] ; then
# echo $title10
# if [ $utf -eq 0 -o $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput10 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput10 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
# fi
#fi
#
## Test of internal offsets and code sizes. This test is run only when there
## is Unicode property support and the link size is 2. The actual tests are
## mostly the same as in some of the above, but in this test we inspect some
## offsets and sizes that require a known link size. This is a doublecheck for
## the maintainer, just in case something changes unexpectedly. The output from
## this test is not the same in 8-bit and 16-bit modes.
#
#if [ $do11 = yes ] ; then
# echo $title11
# if [ $link_size -ne 2 ] ; then
# echo " Skipped because link size is not 2"
# elif [ $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput11 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput11-$bits testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study" ; else echo " OK"; fi
# done
# fi
#fi
#
## Test JIT-specific features when JIT is available
#
#if [ $do12 = yes ] ; then
# echo $title12
# if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
# echo " Skipped because JIT is not available or not usable"
# else
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput12 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput12 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
## Test JIT-specific features when JIT is not available
#
#if [ $do13 = yes ] ; then
# echo $title13
# if [ $jit -ne 0 ] ; then
# echo " Skipped because JIT is available"
# else
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput13 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput13 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
## Tests for 8-bit-specific features
#
#if [ "$do14" = yes ] ; then
# echo $title14
# if [ "$bits" = "16" -o "$bits" = "32" ] ; then
# echo " Skipped when running 16/32-bit tests"
# else
# cp -f $testdata/saved16 testsaved16
# cp -f $testdata/saved32 testsaved32
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput14 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput14 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for 8-bit-specific features (needs UTF-8 support)
#
#if [ "$do15" = yes ] ; then
# echo $title15
# if [ "$bits" = "16" -o "$bits" = "32" ] ; then
# echo " Skipped when running 16/32-bit tests"
# elif [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput15 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput15 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for 8-bit-specific features (Unicode property support)
#
#if [ $do16 = yes ] ; then
# echo $title16
# if [ "$bits" = "16" -o "$bits" = "32" ] ; then
# echo " Skipped when running 16/32-bit tests"
# elif [ $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput16 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput16 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for 16/32-bit-specific features
#
#if [ $do17 = yes ] ; then
# echo $title17
# if [ "$bits" = "8" ] ; then
# echo " Skipped when running 8-bit tests"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput17 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput17 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for 16/32-bit-specific features (UTF-16/32 support)
#
#if [ $do18 = yes ] ; then
# echo $title18
# if [ "$bits" = "8" ] ; then
# echo " Skipped when running 8-bit tests"
# elif [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput18 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput18-$bits testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for 16/32-bit-specific features (Unicode property support)
#
#if [ $do19 = yes ] ; then
# echo $title19
# if [ "$bits" = "8" ] ; then
# echo " Skipped when running 8-bit tests"
# elif [ $ucp -eq 0 ] ; then
# echo " Skipped because Unicode property support is not available"
# else
# for opt in "" "-s" $jitopt; do
# $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput19 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput19 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for 16/32-bit-specific features in DFA non-UTF-16/32 mode
#
#if [ $do20 = yes ] ; then
# echo $title20
# if [ "$bits" = "8" ] ; then
# echo " Skipped when running 8-bit tests"
# else
# for opt in "" "-s"; do
# $sim $valgrind ./pcre2test -q $bmode $opt -dfa $testdata/testinput20 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput20 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# if [ "$opt" = "-s" ] ; then echo " OK with study"
# else echo " OK"
# fi
# done
# fi
#fi
#
## Tests for reloads with 16/32-bit library
#
#if [ $do21 = yes ] ; then
# echo $title21
# if [ "$bits" = "8" ] ; then
# echo " Skipped when running 8-bit tests"
# elif [ $link_size -ne 2 ] ; then
# echo " Skipped because link size is not 2"
# else
# cp -f $testdata/saved8 testsaved8
# cp -f $testdata/saved16LE-1 testsaved16LE-1
# cp -f $testdata/saved16BE-1 testsaved16BE-1
# cp -f $testdata/saved32LE-1 testsaved32LE-1
# cp -f $testdata/saved32BE-1 testsaved32BE-1
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput21 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput21-$bits testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
## Tests for reloads with 16/32-bit library (UTF-16 support)
#
#if [ $do22 = yes ] ; then
# echo $title22
# if [ "$bits" = "8" ] ; then
# echo " Skipped when running 8-bit tests"
# elif [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# elif [ $link_size -ne 2 ] ; then
# echo " Skipped because link size is not 2"
# else
# cp -f $testdata/saved16LE-2 testsaved16LE-2
# cp -f $testdata/saved16BE-2 testsaved16BE-2
# cp -f $testdata/saved32LE-2 testsaved32LE-2
# cp -f $testdata/saved32BE-2 testsaved32BE-2
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput22 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput22-$bits testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
#if [ $do23 = yes ] ; then
# echo $title23
# if [ "$bits" = "8" -o "$bits" = "32" ] ; then
# echo " Skipped when running 8/32-bit tests"
# else
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput23 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput23 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
#if [ $do24 = yes ] ; then
# echo $title24
# if [ "$bits" = "8" -o "$bits" = "32" ] ; then
# echo " Skipped when running 8/32-bit tests"
# elif [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput24 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput24 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
#if [ $do25 = yes ] ; then
# echo $title25
# if [ "$bits" = "8" -o "$bits" = "16" ] ; then
# echo " Skipped when running 8/16-bit tests"
# else
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput25 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput25 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
#
#if [ $do26 = yes ] ; then
# echo $title26
# if [ "$bits" = "8" -o "$bits" = "16" ] ; then
# echo " Skipped when running 8/16-bit tests"
# elif [ $utf -eq 0 ] ; then
# echo " Skipped because UTF-$bits support is not available"
# else
# $sim $valgrind ./pcre2test -q $bmode $testdata/testinput26 testtry
# if [ $? = 0 ] ; then
# $cf $testdata/testoutput26 testtry
# if [ $? != 0 ] ; then exit 1; fi
# else exit 1
# fi
# echo " OK"
# fi
#fi
# End of loop for 8/16/32-bit tests
done
# Clean up local working files. The test3* names are the locale-adjusted
# copies written by the locale test (input plus the three alternative
# expected outputs); testsaved* is deliberately left as an unquoted glob.
rm -f test3input test3output test3outputA test3outputB testNinput testsaved* teststderr teststdout testtry
# End

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "05 July 2014" "PCRE 10.00" .TH PCRE2TEST 1 "22 July 2014" "PCRE 10.00"
.SH NAME .SH NAME
pcre2test - a program for testing Perl-compatible regular expressions. pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS .SH SYNOPSIS
@ -51,7 +51,7 @@ before being passed to the library functions. Results are converted back to
8-bit code units for output. 8-bit code units for output.
.P .P
In the rest of this document, the names of library functions and structures In the rest of this document, the names of library functions and structures
are given in generic form, for example, \fBpcre2_compile()\fP. The actual are given in generic form, for example, \fBpcre2_compile()\fP. The actual
names used in the libraries have a suffix _8, _16, or _32, as appropriate. names used in the libraries have a suffix _8, _16, or _32, as appropriate.
. .
. .
@ -130,8 +130,8 @@ form and information about the compiled pattern is output after compilation;
\fB-d\fP is equivalent to \fB-b -i\fP. \fB-d\fP is equivalent to \fB-b -i\fP.
.TP 10 .TP 10
\fB-dfa\fP \fB-dfa\fP
Behave as if each subject line has the \fBdfa\fP modifier; matching is done Behave as if each subject line has the \fBdfa\fP modifier; matching is done
using the \fBpcre2_dfa_match()\fP function instead of the default using the \fBpcre2_dfa_match()\fP function instead of the default
\fBpcre2_match()\fP. \fBpcre2_match()\fP.
.TP 10 .TP 10
\fB-help\fP \fB-help\fP
@ -141,6 +141,10 @@ Output a brief summary these options and then exit.
Behave as if each pattern has the \fB/info\fP modifier; information about the Behave as if each pattern has the \fB/info\fP modifier; information about the
compiled pattern is given after compilation. compiled pattern is given after compilation.
.TP 10 .TP 10
\fB-jit\fP
Behave as if each pattern line has the \fBjit\fP modifier; after successful
compilation, each pattern is passed to the just-in-time compiler, if available.
.TP 10
\fB-pattern\fP \fImodifier-list\fP \fB-pattern\fP \fImodifier-list\fP
Behave as if each pattern line contains the given modifiers. Behave as if each pattern line contains the given modifiers.
.TP 10 .TP 10
@ -152,7 +156,7 @@ On Unix-like systems, set the size of the run-time stack to \fIsize\fP
megabytes. megabytes.
.TP 10 .TP 10
\fB-subject\fP \fImodifier-list\fP \fB-subject\fP \fImodifier-list\fP
Behave as if each subject line contains the given modifiers. Behave as if each subject line contains the given modifiers.
.TP 10 .TP 10
\fB-t\fP \fB-t\fP
Run each compile and match many times with a timer, and output the resulting Run each compile and match many times with a timer, and output the resulting
@ -191,7 +195,7 @@ the \fB-help\fP option states whether or not \fBreadline()\fP will be used.
The program handles any number of tests, each of which consists of a set of The program handles any number of tests, each of which consists of a set of
input lines. Each set starts with a regular expression pattern, followed by any input lines. Each set starts with a regular expression pattern, followed by any
number of subject lines to be matched against that pattern. In between sets of number of subject lines to be matched against that pattern. In between sets of
test data, command lines that begin with a hash (#) character may appear. This test data, command lines that begin with a hash (#) character may appear. This
file format, with some restrictions, can also be processed by the file format, with some restrictions, can also be processed by the
\fBperltest.pl\fP script that is distributed with PCRE2 as a means of checking \fBperltest.pl\fP script that is distributed with PCRE2 as a means of checking
that the behaviour of PCRE2 and Perl is the same. that the behaviour of PCRE2 and Perl is the same.
@ -212,52 +216,63 @@ still input to be read.
.SH "COMMAND LINES" .SH "COMMAND LINES"
.rs .rs
.sp .sp
In between sets of test data, a line that begins with a hash (#) character is In between sets of test data, a line that begins with a hash (#) character is
interpreted as a command line. If the first character is followed by white interpreted as a command line. If the first character is followed by white
space or an exclamation mark, the line is treated as a comment, and ignored. space or an exclamation mark, the line is treated as a comment, and ignored.
Otherwise, the following commands are recognized: Otherwise, the following commands are recognized:
.sp
#forbid_utf
.sp
Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP
options set, which locks out the use of UTF and Unicode property features. This
is a trigger guard that is used in test files to ensure that UTF/Unicode tests
are not accidentally added to files that are used when UTF support is not
included in the library. This effect can also be obtained by the use of
\fB#pattern\fP; the difference is that \fB#forbid_utf\fP cannot be unset, and
the automatic options are not displayed in pattern information, to avoid
cluttering up test output.
.sp .sp
#load <file name> #load <file name>
.sp .sp
Load a pre-compiled pattern that has been saved in a file. This command must be Load a pre-compiled pattern that has been saved in a file. This command must be
followed immediately by any subject lines that are to be matched by the followed immediately by any subject lines that are to be matched by the
pattern. pattern.
.sp .sp
#pattern <modifier-list> #pattern <modifier-list>
.sp .sp
This command sets a default modifier list that applies to all subsequent This command sets a default modifier list that applies to all subsequent
patterns. Modifiers on a pattern can change these settings. patterns. Modifiers on a pattern can change these settings.
.sp .sp
#perltest #perltest
.sp .sp
The appearance of this line causes all subsequent modifier settings to be The appearance of this line causes all subsequent modifier settings to be
checked for compatibility with the \fBperltest.pl\fP script, which is used to checked for compatibility with the \fBperltest.pl\fP script, which is used to
confirm that Perl gives the same results as PCRE2. Also, apart from comment confirm that Perl gives the same results as PCRE2. Also, apart from comment
lines, none of the other command lines are permitted, because they and many lines, none of the other command lines are permitted, because they and many
of the modifiers are specific to \fBpcre2test\fP, and should not be used in of the modifiers are specific to \fBpcre2test\fP, and should not be used in
test files that are also processed by \fBperltest.pl\fP. The \fB#perltest\fP test files that are also processed by \fBperltest.pl\fP. The \fB#perltest\fP
command helps detect tests that are accidentally put in the wrong file. command helps detect tests that are accidentally put in the wrong file.
.sp .sp
#subject <modifier-list> #subject <modifier-list>
.sp .sp
This command sets a default modifier list that applies to all subsequent This command sets a default modifier list that applies to all subsequent
subject lines. Modifiers on a subject line can change these settings. subject lines. Modifiers on a subject line can change these settings.
. .
. .
.SH "MODIFIER SYNTAX" .SH "MODIFIER SYNTAX"
.rs .rs
.sp .sp
Modifier lists are used with both pattern and subject lines. Items in a list Modifier lists are used with both pattern and subject lines. Items in a list
are separated by commas and optional white space. Some modifiers may be given are separated by commas and optional white space. Some modifiers may be given
for both patterns and subject lines, whereas others are valid for one or the for both patterns and subject lines, whereas others are valid for one or the
other only. Each modifier has a long name, for example "anchored", and some of other only. Each modifier has a long name, for example "anchored", and some of
them must be followed by an equals sign and a value, for example, "offset=12". them must be followed by an equals sign and a value, for example, "offset=12".
Modifiers that do not take values may be preceded by a minus sign to turn off a Modifiers that do not take values may be preceded by a minus sign to turn off a
previous default setting. previous default setting.
.P .P
A few of the more common modifiers can also be specified as single or double A few of the more common modifiers can also be specified as single or double
letters, for example "i" for "caseless". In documentation, following the Perl letters, for example "i" for "caseless". In documentation, following the Perl
convention, these are written with a slash ("the /i modifier") for clarity. convention, these are written with a slash ("the /i modifier") for clarity.
Abbreviated modifiers must all be concatenated in the first item of a modifier Abbreviated modifiers must all be concatenated in the first item of a modifier
list. If the first item is not recognized as a long modifier name, it is list. If the first item is not recognized as a long modifier name, it is
interpreted as a sequence of these abbreviations. For example: interpreted as a sequence of these abbreviations. For example:
@ -340,28 +355,29 @@ possible to construct invalid UTF-16 sequences for testing purposes.
In UTF-32 mode, all 4- to 8-digit \ex{...} values are accepted. This makes it In UTF-32 mode, all 4- to 8-digit \ex{...} values are accepted. This makes it
possible to construct invalid UTF-32 sequences for testing purposes. possible to construct invalid UTF-32 sequences for testing purposes.
.P .P
There is a special backslash sequence that specifies replication of one or more There is a special backslash sequence that specifies replication of one or more
characters: characters:
.sp .sp
\e[<characters>]{<count>} \e[<characters>]{<count>}
.sp .sp
This makes it possible to test long strings without having to provide them as This makes it possible to test long strings without having to provide them as
part of the file. For example: part of the file. For example:
.sp .sp
\e[abc]{4} \e[abc]{4}
.sp .sp
is converted to "abcabcabcabc". This feature does not support nesting. To is converted to "abcabcabcabc". This feature does not support nesting. To
include a closing square bracket in the characters, code it as \ex5D. include a closing square bracket in the characters, code it as \ex5D.
.P .P
A backslash followed by an equals sign marks the end of the subject string and A backslash followed by an equals sign marks the end of the subject string and
the start of a modifier list. For example: the start of a modifier list. For example:
.sp .sp
abc\=notbol,notempty abc\=notbol,notempty
.sp .sp
A backslash followed by anything else causes an error. However, if the very A backslash followed by any other non-alphanumeric character just escapes that
last character in the line is a backslash (and there is no modifier list), it character. A backslash followed by anything else causes an error. However, if
is ignored. This gives a way of passing an empty line as data, since a real the very last character in the line is a backslash (and there is no modifier
empty line terminates the data input. list), it is ignored. This gives a way of passing an empty line as data, since
a real empty line terminates the data input.
. .
. .
.SH "PATTERN MODIFIERS" .SH "PATTERN MODIFIERS"
@ -375,7 +391,7 @@ can add to or override default modifiers that were set by a previous
.SS "Setting compilation options" .SS "Setting compilation options"
.rs .rs
.sp .sp
The following modifiers set options for \fBpcre2_compile()\fP. The most common The following modifiers set options for \fBpcre2_compile()\fP. The most common
ones have single-letter abbreviations. See ones have single-letter abbreviations. See
.\" HREF .\" HREF
\fBpcre2api\fP \fBpcre2api\fP
@ -421,10 +437,10 @@ about the pattern:
flipbytes flip endianness flipbytes flip endianness
/BB fullbincode show binary code with lengths /BB fullbincode show binary code with lengths
/I info show info about compiled pattern /I info show info about compiled pattern
hex pattern is coded in hexadecimal hex pattern is coded in hexadecimal
jit[=<number>] use JIT jit[=<number>] use JIT
locale=<name> use this locale locale=<name> use this locale
memory show memory used memory show memory used
newline=<type> set newline type newline=<type> set newline type
parens_nest_limit=<n> set maximum parentheses depth parens_nest_limit=<n> set maximum parentheses depth
perlcompat lock out non-Perl modifiers perlcompat lock out non-Perl modifiers
@ -432,7 +448,7 @@ about the pattern:
save=<file name> save compiled pattern save=<file name> save compiled pattern
stackguard=<number> test the stackguard feature stackguard=<number> test the stackguard feature
tables=[0|1|2] select internal tables tables=[0|1|2] select internal tables
use_length use the pattern's length use_length use the pattern's length
.sp .sp
The effects of these modifiers are described in the following sections. The effects of these modifiers are described in the following sections.
FIXME: Give more examples. FIXME: Give more examples.
@ -441,23 +457,23 @@ FIXME: Give more examples.
.SS "Newline and \eR handling" .SS "Newline and \eR handling"
.rs .rs
.sp .sp
The \fBbsr\fP modifier specifies what \eR in a pattern should match. If it is The \fBbsr\fP modifier specifies what \eR in a pattern should match. If it is
set to "anycrlf", \eR matches CR, LF, or CRLF only. If it is set to "unicode", set to "anycrlf", \eR matches CR, LF, or CRLF only. If it is set to "unicode",
\eR matches any Unicode newline sequence. The default is specified when PCRE2 \eR matches any Unicode newline sequence. The default is specified when PCRE2
is built, with the default default being Unicode. is built, with the default default being Unicode.
.P .P
The \fBnewline\fP modifier specifies which characters are to be interpreted as The \fBnewline\fP modifier specifies which characters are to be interpreted as
newlines, both in the pattern and (by default) in subject lines. The type must newlines, both in the pattern and (by default) in subject lines. The type must
be one of CR, LF, CRLF, ANYCRLF, or ANY. be one of CR, LF, CRLF, ANYCRLF, or ANY.
.P .P
Both the \eR and newline settings can be changed at match time, but if this is Both the \eR and newline settings can be changed at match time, but if this is
done, JIT matching is disabled. done, JIT matching is disabled.
. .
. .
.SS "Information about a pattern" .SS "Information about a pattern"
.rs .rs
.sp .sp
The \fBdebug\fP modifier is a shorthand for \fBinfo,fullbincode\fP, requesting The \fBdebug\fP modifier is a shorthand for \fBinfo,fullbincode\fP, requesting
all available information. all available information.
.P .P
The \fBbincode\fP modifier causes a representation of the compiled code to be The \fBbincode\fP modifier causes a representation of the compiled code to be
@ -466,12 +482,12 @@ values, which ensures that the same output is generated for different internal
link sizes and different code unit widths. By using \fBbincode\fP, the same link sizes and different code unit widths. By using \fBbincode\fP, the same
regression tests can be used in different environments. regression tests can be used in different environments.
.P .P
The \fBfullbincode\fP modifier, by contrast, \fIdoes\fP include length and The \fBfullbincode\fP modifier, by contrast, \fIdoes\fP include length and
offset values. This is used in a few special tests and is also useful for offset values. This is used in a few special tests and is also useful for
one-off tests. one-off tests.
.P .P
The \fBinfo\fP modifier requests information about the compiled pattern The \fBinfo\fP modifier requests information about the compiled pattern
(whether it is anchored, has a fixed first character, and so on). The (whether it is anchored, has a fixed first character, and so on). The
information is obtained from the \fBpcre2_pattern_info()\fP function. information is obtained from the \fBpcre2_pattern_info()\fP function.
. .
. .
@ -490,21 +506,21 @@ below.
.SS "Specifying a pattern in hex" .SS "Specifying a pattern in hex"
.rs .rs
.sp .sp
The \fBhex\fP modifier specifies that the characters of the pattern are to be The \fBhex\fP modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted between interpreted as pairs of hexadecimal digits. White space is permitted between
pairs. For example: pairs. For example:
.sp .sp
/ab 32 59/hex /ab 32 59/hex
.sp .sp
This feature is provided as a way of creating patterns that contain binary zero This feature is provided as a way of creating patterns that contain binary zero
characters. When \fBhex\fP is set, it implies \fBuse_length\fP. characters. When \fBhex\fP is set, it implies \fBuse_length\fP.
. .
. .
.SS "Using the pattern's length" .SS "Using the pattern's length"
.rs .rs
.sp .sp
By default, \fBpcre2test\fP passes patterns as zero-terminated strings to By default, \fBpcre2test\fP passes patterns as zero-terminated strings to
\fBpcre2_compile()\fP, giving the length as -1. If \fBuse_length\fP is set, the \fBpcre2_compile()\fP, giving the length as -1. If \fBuse_length\fP is set, the
length of the pattern is passed. This is implied if \fBhex\fP is set. length of the pattern is passed. This is implied if \fBhex\fP is set.
. .
. .
@ -549,7 +565,7 @@ character tables for the locale, and this is then passed to
\fBpcre2_compile()\fP when compiling the regular expression. The same tables \fBpcre2_compile()\fP when compiling the regular expression. The same tables
are used when matching the following subject lines. The \fB/locale\fP modifier are used when matching the following subject lines. The \fB/locale\fP modifier
applies only to the pattern on which it appears, but can be given in a applies only to the pattern on which it appears, but can be given in a
\fB#pattern\fP command if a default is needed. Setting a locale and alternate \fB#pattern\fP command if a default is needed. Setting a locale and alternate
character tables are mutually exclusive. character tables are mutually exclusive.
. .
. .
@ -566,7 +582,7 @@ also output.
.SS "Limiting nested parentheses" .SS "Limiting nested parentheses"
.rs .rs
.sp .sp
The \fBparens_nest_limit\fP modifier sets a limit on the depth of nested The \fBparens_nest_limit\fP modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation error. parentheses in a pattern. Breaching the limit causes a compilation error.
. .
. .
@ -594,14 +610,17 @@ below. All other modifiers cause an error.
.rs .rs
.sp .sp
The \fB/stackguard\fP modifier is used to test the use of The \fB/stackguard\fP modifier is used to test the use of
\fBpcre2_stack_guard\fP. It must be followed by '0' or '1', specifying the \fBpcre2_set_compile_recursion_guard()\fP, a function that is provided to
return code to be given from an external function that is passed to PCRE2 and enable stack availability to be checked during compilation (see the
used for stack checking during compilation (see the
.\" HREF .\" HREF
\fBpcre2api\fP \fBpcre2api\fP
.\" .\"
documentation for details). FIXME: this needs doing properly once the test is documentation for details). If the number specified by the modifier is greater
implemented. Mention nested parens limit. than zero, \fBpcre2_set_compile_recursion_guard()\fP is called to set up
a callback from \fBpcre2_compile()\fP to a local function. The argument it is
passed is the current nesting parenthesis depth; if this is greater than the
value given by the modifier, non-zero is returned, causing the compilation to
be aborted.
. .
. .
.SS "Using alternative character tables" .SS "Using alternative character tables"
@ -618,7 +637,7 @@ different character tables. The digit specifies the tables as follows:
2 a set of tables defining ISO 8859 characters 2 a set of tables defining ISO 8859 characters
.sp .sp
In table 2, some characters whose codes are greater than 128 are identified as In table 2, some characters whose codes are greater than 128 are identified as
letters, digits, spaces, etc. Setting alternate character tables and a locale letters, digits, spaces, etc. Setting alternate character tables and a locale
are mutually exclusive. are mutually exclusive.
. .
. .
@ -635,24 +654,24 @@ not affect the compilation process.
allcaptures show all captures allcaptures show all captures
/gg altglobal alternative global matching /gg altglobal alternative global matching
/g global global matching /g global global matching
jitverify verify JIT usage jitverify verify JIT usage
mark show mark values mark show mark values
.sp .sp
These modifiers may not appear in a \fB#pattern\fP command. If you want them as These modifiers may not appear in a \fB#pattern\fP command. If you want them as
defaults, set them in a \fB#subject\fP command. defaults, set them in a \fB#subject\fP command.
. .
. .
.SH "SUBJECT MODIFIERS" .SH "SUBJECT MODIFIERS"
.rs .rs
.sp .sp
The modifiers that can appear in subject lines and the \fB#subject\fP The modifiers that can appear in subject lines and the \fB#subject\fP
command are of two types. command are of two types.
. .
. .
.SS "Setting match options" .SS "Setting match options"
.rs .rs
.sp .sp
The following modifiers set options for \fBpcre2_match()\fP or The following modifiers set options for \fBpcre2_match()\fP or
\fBpcre2_dfa_match()\fP. See \fBpcre2_dfa_match()\fP. See
.\" HREF .\" HREF
\fBpcre2api\fP \fBpcre2api\fP
@ -674,7 +693,7 @@ for a description of their effects.
If the \fB/posix\fP modifier was present on the pattern, causing the POSIX If the \fB/posix\fP modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any effect wrapper API to be used, the only option-setting modifiers that have any effect
are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL, are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP. REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP.
Any other modifiers cause an error. Any other modifiers cause an error.
. .
.SS "Setting match controls" .SS "Setting match controls"
@ -691,7 +710,7 @@ pattern.
/gg altglobal alternative global matching /gg altglobal alternative global matching
bsr=[anycrlf|unicode] specify \eR handling bsr=[anycrlf|unicode] specify \eR handling
callout_capture show captures at callout time callout_capture show captures at callout time
callout_data=<n> set a value to pass via callouts callout_data=<n> set a value to pass via callouts
callout_fail=<n>[:<m>] control callout failure callout_fail=<n>[:<m>] control callout failure
callout_none do not supply a callout function callout_none do not supply a callout function
copy=<number or name> copy captured substring copy=<number or name> copy captured substring
@ -717,9 +736,9 @@ FIXME: Give more examples.
.SS "Newline and \eR handling" .SS "Newline and \eR handling"
.rs .rs
.sp .sp
These modifiers set the newline and \eR processing conventions for the subject These modifiers set the newline and \eR processing conventions for the subject
line, overriding any values that were set at compile time (as described above). line, overriding any values that were set at compile time (as described above).
JIT matching is disabled if these settings are changed at match time. JIT matching is disabled if these settings are changed at match time.
. .
. .
.SS "Showing more text" .SS "Showing more text"
@ -751,31 +770,31 @@ A callout function is supplied when \fBpcre2test\fP calls the library matching
functions, unless \fBcallout_none\fP is specified. If \fBcallout_capture\fP is functions, unless \fBcallout_none\fP is specified. If \fBcallout_capture\fP is
set, the current captured groups are output when a callout occurs. set, the current captured groups are output when a callout occurs.
.P .P
The \fBcallout_fail\fP modifier can be given one or two numbers. If there is The \fBcallout_fail\fP modifier can be given one or two numbers. If there is
only one number, 1 is returned instead of 0 when a callout of that number is only one number, 1 is returned instead of 0 when a callout of that number is
reached. If two numbers are given, 1 is returned when callout <n> is reached reached. If two numbers are given, 1 is returned when callout <n> is reached
for the <m>th time. for the <m>th time.
.P .P
The \fBcallout_data\fP modifier can be given an unsigned or a negative number. The \fBcallout_data\fP modifier can be given an unsigned or a negative number.
Any value other than zero is used as a return from \fBpcre2test\fP's callout Any value other than zero is used as a return from \fBpcre2test\fP's callout
function. function.
. .
. .
.SS "Testing substring extraction functions" .SS "Testing substring extraction functions"
.rs .rs
.sp .sp
The \fBcopy\fP and \fBget\fP modifiers can be used to test the The \fBcopy\fP and \fBget\fP modifiers can be used to test the
\fBpcre2_substring_copy_xxx()\fP and \fBpcre2_substring_get_xxx()\fP functions. \fBpcre2_substring_copy_xxx()\fP and \fBpcre2_substring_get_xxx()\fP functions.
They can be given more than once, and each can specify a group name or number, They can be given more than once, and each can specify a group name or number,
for example: for example:
.sp .sp
abcd\=copy=1,copy=3,get=G1 abcd\=copy=1,copy=3,get=G1
.sp .sp
If the \fB#subject\fP command is used to set default copy and get lists, these If the \fB#subject\fP command is used to set default copy and get lists, these
can be unset by specifying a negative number for numbered groups and an empty can be unset by specifying a negative number for numbered groups and an empty
name for named groups. name for named groups.
.P .P
The \fBgetall\fP modifier tests \fBpcre2_substring_list_get()\fP, which The \fBgetall\fP modifier tests \fBpcre2_substring_list_get()\fP, which
extracts all captured substrings. extracts all captured substrings.
.P .P
If the subject line is successfully matched, the substrings extracted by the If the subject line is successfully matched, the substrings extracted by the
@ -820,7 +839,7 @@ default 32K is necessary only for very complicated patterns.
.SS "Setting match and recursion limits" .SS "Setting match and recursion limits"
.rs .rs
.sp .sp
The \fBmatch_limit\fP and \fBrecursion_limit\fP modifiers set the appropriate The \fBmatch_limit\fP and \fBrecursion_limit\fP modifiers set the appropriate
limits in the match context. These values are ignored when the limits in the match context. These values are ignored when the
\fBfind_limits\fP modifier is specified. \fBfind_limits\fP modifier is specified.
. .
@ -857,23 +876,23 @@ is added to the non-match message.
.SS "Showing memory usage" .SS "Showing memory usage"
.rs .rs
.sp .sp
The \fBmemory\fP modifier causes \fBpcre2test\fP to log all memory allocation The \fBmemory\fP modifier causes \fBpcre2test\fP to log all memory allocation
and freeing calls that occur during a match operation. and freeing calls that occur during a match operation.
. .
. .
.SS "Setting a starting offset" .SS "Setting a starting offset"
.rs .rs
.sp .sp
The \fBoffset\fP modifier sets an offset in the subject string at which The \fBoffset\fP modifier sets an offset in the subject string at which
matching starts. Its value is a number of code units, not characters. matching starts. Its value is a number of code units, not characters.
. .
. .
.SS "Setting the size of the output vector" .SS "Setting the size of the output vector"
.rs .rs
.sp .sp
The \fBovector\fP modifier applies only to the subject line in which it The \fBovector\fP modifier applies only to the subject line in which it
appears, though of course it can also be used to set a default in a appears, though of course it can also be used to set a default in a
\fB#subject\fP command. It specifies the number of pairs of offsets that are \fB#subject\fP command. It specifies the number of pairs of offsets that are
available for storing matching information. The default is 15. available for storing matching information. The default is 15.
. .
. .
@ -909,7 +928,7 @@ Otherwise, it outputs "No match" when the return is PCRE2_ERROR_NOMATCH, or
return is PCRE2_ERROR_PARTIAL. (Note that this is the return is PCRE2_ERROR_PARTIAL. (Note that this is the
entire substring that was inspected during the partial match; it may include entire substring that was inspected during the partial match; it may include
characters before the actual match start if a lookbehind assertion, \eK, \eb, characters before the actual match start if a lookbehind assertion, \eK, \eb,
or \eB was involved.) or \eB was involved.)
.P .P
For any other return, \fBpcre2test\fP outputs the PCRE2 For any other return, \fBpcre2test\fP outputs the PCRE2
negative error number and a short descriptive phrase. If the error is a failed negative error number and a short descriptive phrase. If the error is a failed
@ -1210,6 +1229,6 @@ Cambridge CB2 3QH, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 05 July 2014 Last updated: 22 July 2014
Copyright (c) 1997-2014 University of Cambridge. Copyright (c) 1997-2014 University of Cambridge.
.fi .fi

247
perltest.pl Executable file
View File

@ -0,0 +1,247 @@
#! /usr/bin/env perl
# Program for testing regular expressions with perl to check that PCRE2 handles
# them the same. This version needs to have "use utf8" at the start for running
# the UTF-8 tests, but *not* for the other tests. The only way I've found for
# doing this is to cat this line in explicitly in the RunPerlTest script. I've
# also used this method to supply "require Encode" for the UTF-8 tests, so that
# the main test will still run where Encode is not installed.
#use utf8;
#require Encode;
# Function for turning a string into a string of printing chars.
#
# Argument: $_[0] - the string to convert.
# Returns:  a copy in which every character with a code in the range 32..126
#           is kept as is and every other character is replaced by a hex
#           escape.
#
# The form of the escape depends on the global $utf8 flag, which the main loop
# sets from the pattern's "utf" modifier: when it is true the string is
# unpacked into code points and escapes are written as \x{hh}; otherwise the
# string is split into single characters and escapes are written as \xhh.
# All temporaries are lexical, so no package globals are modified.
sub pchars {
  my $out = "";
  if ($utf8)
    {
    foreach my $cp (unpack('U*', $_[0]))
      {
      $out .= ($cp >= 32 && $cp < 127)? chr($cp) : sprintf("\\x{%02x}", $cp);
      }
    }
  else
    {
    foreach my $ch (split(//, $_[0]))
      {
      my $code = ord $ch;
      $out .= ($code >= 32 && $code < 127)? $ch : sprintf("\\x%02x", $code);
      }
    }
  return $out;
}
# Read lines from named file or stdin and write to named file or stdout; lines
# consist of a regular expression, in delimiters and optionally followed by
# options, followed by a set of test data, terminated by an empty line.
# Sort out the input and output files. The streams are referred to by name
# (symbolic filehandles held in $infile and $outfile). The defaults are the
# standard streams; the first command line argument, if present, names the
# input file and the second names the output file. A banner giving the Perl
# version is then written to the selected output.
$infile = "STDIN";
$outfile = "STDOUT";
if (@ARGV > 0)
  {
  open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
  $infile = "INFILE";
  }
if (@ARGV > 1)
  {
  open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
  $outfile = "OUTFILE";
  }
printf($outfile "Perl $] Regular Expressions\n\n");
# Main loop. Each iteration reads one pattern (which may span several lines),
# strips the pcre2test-only modifiers, checks that Perl can compile what is
# left, and then reads and matches subject lines until an empty line or end of
# file. When input comes from a file, every line read is echoed to the output
# so that the result can be compared with pcre2test's output; when input is
# STDIN, prompts are written instead.
#
# Echoed text and error messages are always passed to printf as arguments of
# an explicit "%s" format, never as the format itself, so that "%" characters
# in test data are output unchanged.
NEXT_RE:
for (;;)
  {
  printf " re> " if $infile eq "STDIN";
  last if ! ($_ = <$infile>);
  printf $outfile ("%s", $_) if $infile ne "STDIN";
  next if ($_ =~ /^\s*$/ || $_ =~ /^#/);

  $pattern = $_;

  # Keep reading until the line(s) contain a closing delimiter that matches
  # the opening one.
  while ($pattern !~ /^\s*(.).*\1/s)
    {
    printf " > " if $infile eq "STDIN";
    last if ! ($_ = <$infile>);
    printf $outfile ("%s", $_) if $infile ne "STDIN";
    $pattern .= $_;
    }

  chomp($pattern);
  $pattern =~ s/\s+$//;

  # Split the pattern from the modifiers and adjust them as necessary.
  $pattern =~ /^\s*((.).*\2)(.*)$/s;
  $pat = $1;
  $mod = $3;

  # "allaftertext" is used by pcretest to print remainders after captures.
  # It must be removed before "aftertext" is looked for, because it contains
  # "aftertext" as a substring.
  $mod =~ s/allaftertext,?//;

  # The private "aftertext" modifier means "print $' afterwards".
  $showrest = ($mod =~ s/aftertext,?//);

  # Detect utf
  $utf8 = $mod =~ s/utf,?//;

  # Remove "dupnames".
  $mod =~ s/dupnames,?//;

  # Remove "mark" (asks pcre2test to check MARK data)
  $mod =~ s/mark,?//;

  # "ucp" asks pcre2test to set PCRE_UCP; change this to /u for Perl
  # NOTE(review): the substitution below acts on a single-letter "W", not on
  # the long name "ucp" - confirm which spelling the test files use.
  $mod =~ s/W(?=[a-zA-Z]*$)/u/;

  # Remove "no_auto_possess" and "no_start_optimize" (disable PCRE2 optimizations)
  $mod =~ s/no_auto_possess,?//;
  $mod =~ s/no_start_optimize,?//;

  # Add back retained modifiers and check that the pattern is valid.
  $mod =~ s/,//g;
  $pattern = "$pat$mod";
  eval "\$_ =~ ${pattern}";
  if ($@)
    {
    printf $outfile ("Error: %s", $@);
    # When reading from a file, skip the subject lines that belong to the
    # bad pattern. This is a string comparison; a numeric comparison of the
    # two handle names would always be false.
    if ($infile ne "STDIN")
      {
      for (;;)
        {
        last if ! ($_ = <$infile>);
        last if $_ =~ /^\s*$/;
        }
      }
    next NEXT_RE;
    }

  # If the /g modifier is present, we want to put a loop round the matching;
  # otherwise just a single "if".
  $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";

  # If the pattern is actually the null string, Perl uses the most recently
  # executed (and successfully compiled) regex instead. This is a nasty trap
  # for the unwary! The PCRE2 test suite does contain null strings in places -
  # if they are allowed through here all sorts of weird and unexpected effects
  # happen. To avoid this, we replace such patterns with a non-null pattern
  # that has the same effect.
  $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);

  # Read data lines and test them
  for (;;)
    {
    printf "data> " if $infile eq "STDIN";
    last NEXT_RE if ! ($_ = <$infile>);
    chomp;
    printf $outfile ("%s\n", $_) if $infile ne "STDIN";

    s/\s+$//;  # Remove trailing space
    s/^\s+//;  # Remove leading space
    s/\\Y//g;  # Remove \Y (pcretest flag to set PCRE_NO_START_OPTIMIZE)

    last if ($_ eq "");

    # To get escapes processed, the subject is evaluated as a Perl
    # double-quoted string; this script should be run on trusted test files
    # only.
    $x = eval "\"$_\"";

    # Empty array for holding results, ensure $REGERROR and $REGMARK are
    # unset, then do the matching. Each successful match pushes 18 values:
    # the whole match, $1 to $16, and the text that follows the match.
    @subs = ();

    $pushes = "push \@subs,\$&;" .
         "push \@subs,\$1;" .
         "push \@subs,\$2;" .
         "push \@subs,\$3;" .
         "push \@subs,\$4;" .
         "push \@subs,\$5;" .
         "push \@subs,\$6;" .
         "push \@subs,\$7;" .
         "push \@subs,\$8;" .
         "push \@subs,\$9;" .
         "push \@subs,\$10;" .
         "push \@subs,\$11;" .
         "push \@subs,\$12;" .
         "push \@subs,\$13;" .
         "push \@subs,\$14;" .
         "push \@subs,\$15;" .
         "push \@subs,\$16;" .
         "push \@subs,\$'; }";

    undef $REGERROR;
    undef $REGMARK;

    eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;

    if ($@)
      {
      printf $outfile ("Error: %s\n", $@);
      next NEXT_RE;
      }
    elsif (scalar(@subs) == 0)
      {
      printf $outfile "No match";
      if (defined $REGERROR && $REGERROR != 1)
        { printf $outfile (", mark = %s", &pchars($REGERROR)); }
      printf $outfile "\n";
      }
    else
      {
      # Output one group of 18 values for each match that was found.
      while (scalar(@subs) != 0)
        {
        printf $outfile (" 0: %s\n", &pchars($subs[0]));
        printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
        $last_printed = 0;
        for ($i = 1; $i <= 16; $i++)
          {
          if (defined $subs[$i])
            {
            # Show any unset groups that precede this set one.
            while ($last_printed++ < $i-1)
              { printf $outfile ("%2d: <unset>\n", $last_printed); }
            printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
            $last_printed = $i;
            }
          }
        splice(@subs, 0, 18);
        }

      # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
      # set and the input pattern was a UTF-8 string. We can, however, force
      # it to be so marked.
      if (defined $REGMARK && $REGMARK != 1)
        {
        $xx = $REGMARK;
        $xx = Encode::decode_utf8($xx) if $utf8;
        printf $outfile ("MK: %s\n", &pchars($xx));
        }
      }
    }
  }
# printf $outfile "\n";
# End

View File

@ -561,7 +561,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77 }; ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such /* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -1703,10 +1703,10 @@ else
ptr += 4; ptr += 4;
if (utf) if (utf)
{ {
if (c > 0x10ffffU) *errorcodeptr = ERR76; if (c > 0x10ffffU) *errorcodeptr = ERR77;
else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
} }
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR76; else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
} }
break; break;
@ -1815,12 +1815,11 @@ else
recommended to avoid the ambiguities in the old syntax. recommended to avoid the ambiguities in the old syntax.
Outside a character class, the digits are read as a decimal number. If the Outside a character class, the digits are read as a decimal number. If the
number is less than 8 (used to be 10), or if there are that many previous number is less than 10, or if there are that many previous extracting left
extracting left brackets, then it is a back reference. Otherwise, up to brackets, it is a back reference. Otherwise, up to three octal digits are
three octal digits are read to form an escaped byte. Thus \123 is likely to read to form an escaped byte. Thus \123 is likely to be octal 123 (cf
be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If \0123, which is octal 012 followed by the literal 3). If the octal value is
the octal value is greater than 377, the least significant 8 bits are greater than 377, the least significant 8 bits are taken.
taken. \8 and \9 are treated as the literal characters 8 and 9.
Inside a character class, \ followed by a digit is always either a literal Inside a character class, \ followed by a digit is always either a literal
8 or 9 or an octal number. */ 8 or 9 or an octal number. */
@ -1832,7 +1831,7 @@ else
{ {
oldptr = ptr; oldptr = ptr;
/* The integer range is limited by the machine's int representation. */ /* The integer range is limited by the machine's int representation. */
s = (int)(c -CHAR_0); s = (int)(c - CHAR_0);
overflow = FALSE; overflow = FALSE;
while (IS_DIGIT(ptr[1])) while (IS_DIGIT(ptr[1]))
{ {
@ -1849,7 +1848,7 @@ else
*errorcodeptr = ERR61; *errorcodeptr = ERR61;
break; break;
} }
if (s < 8 || s <= cb->bracount) /* Check for back reference */ if (s < 10 || s <= cb->bracount) /* Check for back reference */
{ {
escape = -s; escape = -s;
break; break;
@ -1886,7 +1885,7 @@ else
case CHAR_o: case CHAR_o:
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR77; else if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
{ {
ptr += 2; ptr += 2;
c = 0; c = 0;
@ -1947,7 +1946,7 @@ else
ptr += 2; ptr += 2;
if (*ptr == CHAR_RIGHT_CURLY_BRACKET) if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
{ {
*errorcodeptr = ERR77; *errorcodeptr = ERR78;
break; break;
} }
c = 0; c = 0;
@ -1955,12 +1954,12 @@ else
while ((cc = XDIGIT(*ptr)) != 0xff) while ((cc = XDIGIT(*ptr)) != 0xff)
{ {
ptr++;
if (c == 0 && cc == 0) continue; /* Leading zeroes */ if (c == 0 && cc == 0) continue; /* Leading zeroes */
#if PCRE2_CODE_UNIT_WIDTH == 32 #if PCRE2_CODE_UNIT_WIDTH == 32
if (c >= 0x10000000l) { overflow = TRUE; break; } if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif #endif
c = (c << 4) | cc; c = (c << 4) | cc;
ptr++;
if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
{ {
overflow = TRUE; overflow = TRUE;
@ -2002,9 +2001,9 @@ else
break; break;
/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
An error is given if the byte following \c is not an ASCII character. This An error is given if the byte following \c is not a printable ASCII
coding is ASCII-specific, but then the whole concept of \cx is character. This coding is ASCII-specific, but then the whole concept of \cx
ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
case CHAR_c: case CHAR_c:
c = *(++ptr); c = *(++ptr);
@ -2014,7 +2013,7 @@ else
break; break;
} }
#ifndef EBCDIC /* ASCII/UTF-8 coding */ #ifndef EBCDIC /* ASCII/UTF-8 coding */
if (c > 127) /* Excludes all non-ASCII in either mode */ if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
{ {
*errorcodeptr = ERR68; *errorcodeptr = ERR68;
break; break;
@ -3820,7 +3819,7 @@ for (;; ptr++)
{ {
ptr += 2; ptr += 2;
if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
{ ptr += 2; goto CONTINUE_CLASS; } { ptr += 2; continue; }
inescq = TRUE; inescq = TRUE;
break; break;
} }
@ -4981,7 +4980,7 @@ for (;; ptr++)
arglen = (int)(ptr - arg); arglen = (int)(ptr - arg);
if ((unsigned int)arglen > MAX_MARK) if ((unsigned int)arglen > MAX_MARK)
{ {
*errorcodeptr = ERR75; *errorcodeptr = ERR76;
goto FAILED; goto FAILED;
} }
} }
@ -6533,10 +6532,10 @@ Arguments:
reset_bracount TRUE to reset the count for each branch reset_bracount TRUE to reset the count for each branch
skipunits skip this many code units at start (for brackets and OP_COND) skipunits skip this many code units at start (for brackets and OP_COND)
cond_depth depth of nesting for conditional subpatterns cond_depth depth of nesting for conditional subpatterns
firstcuptr place to put the first required code unit firstcuptr place to put the first required code unit
firstcuflagsptr place to put the first code unit flags, or a negative number firstcuflagsptr place to put the first code unit flags, or a negative number
reqcuptr place to put the last required code unit reqcuptr place to put the last required code unit
reqcuflagsptr place to put the last required code unit flags, or a negative number reqcuflagsptr place to put the last required code unit flags, or a negative number
bcptr pointer to the chain of currently open branches bcptr pointer to the chain of currently open branches
cb points to the data block with tables pointers etc. cb points to the data block with tables pointers etc.
lengthptr NULL during the real compile phase lengthptr NULL during the real compile phase
@ -6548,10 +6547,9 @@ Returns: TRUE on success
static BOOL static BOOL
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr, compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr,
int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipunits, int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipunits,
int cond_depth, int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
uint32_t *firstcuptr, int32_t *firstcuflagsptr, uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
uint32_t *reqcuptr, int32_t *reqcuflagsptr, compile_block *cb, size_t *lengthptr)
branch_chain *bcptr, compile_block *cb, size_t *lengthptr)
{ {
PCRE2_SPTR ptr = *ptrptr; PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR *code = *codeptr; PCRE2_UCHAR *code = *codeptr;
@ -6569,15 +6567,13 @@ unsigned int orig_bracount;
unsigned int max_bracount; unsigned int max_bracount;
branch_chain bc; branch_chain bc;
#ifdef FIXME
/* If set, call the external function that checks for stack availability. */ /* If set, call the external function that checks for stack availability. */
if (ccontext->stack_guard != NULL && ccontext->stack_guard(0)) if (cb->cx->stack_guard != NULL && cb->cx->stack_guard(cb->parens_depth))
{ {
*errorcodeptr= ERR33; *errorcodeptr= ERR33;
return FALSE; return FALSE;
} }
#endif
/* Miscellaneous initialization */ /* Miscellaneous initialization */
@ -7434,7 +7430,11 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
c = c*10 + ptr[pp++] - CHAR_0; c = c*10 + ptr[pp++] - CHAR_0;
} }
if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) goto END_PSO; if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
{
errorcode = ERR60;
goto HAD_ERROR;
}
if (p->type == PSO_LIMM) limit_match = c; if (p->type == PSO_LIMM) limit_match = c;
else limit_recursion = c; else limit_recursion = c;
skipatstart += pp - skipatstart; skipatstart += pp - skipatstart;
@ -7443,12 +7443,11 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
break; /* Out of the table scan loop */ break; /* Out of the table scan loop */
} }
} }
if (i > sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
} }
/* End of pattern-start options; advance to start of real regex. */ /* End of pattern-start options; advance to start of real regex. */
END_PSO:
ptr += skipatstart; ptr += skipatstart;
/* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */ /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
@ -7476,6 +7475,15 @@ if (utf)
(errorcode = PRIV(valid_utf)(pattern, -1, erroroffset)) != 0) (errorcode = PRIV(valid_utf)(pattern, -1, erroroffset)) != 0)
goto HAD_ERROR; goto HAD_ERROR;
} }
/* Check UCP lockout. */
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
(PCRE2_UCP|PCRE2_NEVER_UCP))
{
errorcode = ERR75;
goto HAD_ERROR;
}
/* Process the BSR setting. */ /* Process the BSR setting. */

View File

@ -148,15 +148,16 @@ static const char compile_error_texts[] =
"different names for subpatterns of the same number are not allowed\0" "different names for subpatterns of the same number are not allowed\0"
"(*MARK) must have an argument\0" "(*MARK) must have an argument\0"
"non-hex character in \\x{} (closing brace missing?)\0" "non-hex character in \\x{} (closing brace missing?)\0"
"\\c must be followed by an ASCII character\0" "\\c must be followed by a printable ASCII character\0"
"\\k is not followed by a braced, angle-bracketed, or quoted name\0" "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
/* 70 */ /* 70 */
"internal error: unknown opcode in find_fixedlength()\0" "internal error: unknown opcode in find_fixedlength()\0"
"\\N is not supported in a class\0" "\\N is not supported in a class\0"
"too many forward references\0" "too many forward references\0"
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
"using (*UTF) is disabled by the application\0" "using UTF is disabled by the application\0"
/* 75 */ /* 75 */
"using UCP is disabled by the application\0"
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
"character code point value in \\u.... sequence is too large\0" "character code point value in \\u.... sequence is too large\0"
"digits missing in \\x{} or \\o{}\0" "digits missing in \\x{} or \\o{}\0"
@ -223,7 +224,7 @@ static const char match_error_texts[] =
"JIT stack limit reached\0" "JIT stack limit reached\0"
"match limit exceeded\0" "match limit exceeded\0"
"no more memory\0" "no more memory\0"
"unknown substring\0" "unknown or unset substring\0"
/* 50 */ /* 50 */
"NULL argument passed\0" "NULL argument passed\0"
"nested recursion at the same subject position\0" "nested recursion at the same subject position\0"

View File

@ -6782,6 +6782,12 @@ ENDLOOP:
release_match_heapframes(&frame_zero, mb); release_match_heapframes(&frame_zero, mb);
#endif #endif
/* Fill in fields that are always returned in the match data. */
match_data->code = re;
match_data->subject = subject;
match_data->mark = mb->mark;
/* Handle a fully successful match. */ /* Handle a fully successful match. */
if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
@ -6841,26 +6847,27 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
match_data->ovector[0] = mb->start_match_ptr - mb->start_subject; match_data->ovector[0] = mb->start_match_ptr - mb->start_subject;
match_data->ovector[1] = mb->end_match_ptr - mb->start_subject; match_data->ovector[1] = mb->end_match_ptr - mb->start_subject;
} }
/* Set the remaining returned values */
/* Fill in the remaining fields that are returned in the match data. */
match_data->code = re;
match_data->subject = subject;
match_data->leftchar = mb->start_used_ptr - subject; match_data->leftchar = mb->start_used_ptr - subject;
match_data->rightchar = 0; /* FIXME */ match_data->rightchar = 0; /* FIXME */
match_data->startchar = start_match - subject; match_data->startchar = start_match - subject;
match_data->mark = mb->mark;
return match_data->rc; return match_data->rc;
} }
/* Control gets here if there has been a partial match, an error, or if the /* Control gets here if there has been a partial match, an error, or if the
overall match attempt has failed at all permitted starting positions. For overall match attempt has failed at all permitted starting positions. Any mark
anything other than nomatch or partial match, just return the code. */ data is in the nomatch_mark field. */
match_data->mark = mb->nomatch_mark;
/* For anything other than nomatch or partial match, just return the code. */
if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL)
match_data->rc = rc; match_data->rc = rc;
/* Handle a partial match. */ /* Else handle a partial match. */
else if (match_partial != NULL) else if (match_partial != NULL)
{ {
@ -6870,16 +6877,16 @@ else if (match_partial != NULL)
match_data->ovector[1] = end_subject - subject; match_data->ovector[1] = end_subject - subject;
} }
match_data->leftchar = start_partial - subject; match_data->leftchar = start_partial - subject;
match_data->rightchar = 0; /* FIXME */
match_data->startchar = match_partial - subject;
match_data->rc = PCRE2_ERROR_PARTIAL; match_data->rc = PCRE2_ERROR_PARTIAL;
} }
/* This is the classic nomatch case. */ /* Else this is the classic nomatch case. */
else else match_data->rc = PCRE2_ERROR_NOMATCH;
{
match_data->rc = PCRE2_ERROR_NOMATCH; /* Free any temporary offsets. */
match_data->mark = mb->nomatch_mark;
}
if (using_temporary_offsets) if (using_temporary_offsets)
mb->memctl.free(mb->ovector, mb->memctl.memory_data); mb->memctl.free(mb->ovector, mb->memctl.memory_data);

View File

@ -119,6 +119,7 @@ size_t left, right;
size_t p = 0; size_t p = 0;
PCRE2_SPTR subject = match_data->subject; PCRE2_SPTR subject = match_data->subject;
if (stringnumber >= match_data->oveccount || if (stringnumber >= match_data->oveccount ||
stringnumber > match_data->code->top_bracket ||
(left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET) (left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
return PCRE2_ERROR_NOSUBSTRING; return PCRE2_ERROR_NOSUBSTRING;
right = match_data->ovector[stringnumber*2+1]; right = match_data->ovector[stringnumber*2+1];
@ -203,6 +204,7 @@ PCRE2_UCHAR *yield;
PCRE2_SPTR subject = match_data->subject; PCRE2_SPTR subject = match_data->subject;
if (stringnumber >= match_data->oveccount || if (stringnumber >= match_data->oveccount ||
stringnumber > match_data->code->top_bracket ||
(left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET) (left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
return PCRE2_ERROR_NOSUBSTRING; return PCRE2_ERROR_NOSUBSTRING;
right = match_data->ovector[stringnumber*2+1]; right = match_data->ovector[stringnumber*2+1];
@ -293,6 +295,7 @@ pcre2_substring_length_bynumber(pcre2_match_data *match_data,
int stringnumber) int stringnumber)
{ {
if (stringnumber >= match_data->oveccount || if (stringnumber >= match_data->oveccount ||
stringnumber > match_data->code->top_bracket ||
match_data->ovector[stringnumber*2] == PCRE2_UNSET) match_data->ovector[stringnumber*2] == PCRE2_UNSET)
return PCRE2_ERROR_NOSUBSTRING; return PCRE2_ERROR_NOSUBSTRING;
return match_data->ovector[stringnumber*2 + 1] - return match_data->ovector[stringnumber*2 + 1] -

View File

@ -46,7 +46,6 @@ POSSIBILITY OF SUCH DAMAGE.
. save code and #load . save code and #load
. JIT - compile, time, verify . JIT - compile, time, verify
. memory handling testing . memory handling testing
. stackguard testing
*/ */
@ -435,7 +434,7 @@ static modstruct modlist[] = {
{ "dfa_shortest", MOD_DAT, MOD_OPT, PCRE2_DFA_SHORTEST, DO(options) }, { "dfa_shortest", MOD_DAT, MOD_OPT, PCRE2_DFA_SHORTEST, DO(options) },
{ "dollar_endonly", MOD_PAT, MOD_OPT, PCRE2_DOLLAR_ENDONLY, PO(options) }, { "dollar_endonly", MOD_PAT, MOD_OPT, PCRE2_DOLLAR_ENDONLY, PO(options) },
{ "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) }, { "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) },
{ "dupnames", MOD_PAT, MOD_OPT, PCRE2_DUPNAMES, PO(options) }, { "dupnames", MOD_PATP, MOD_OPT, PCRE2_DUPNAMES, PO(options) },
{ "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) }, { "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) },
{ "find_limits", MOD_DAT, MOD_CTL, CTL_FINDLIMITS, DO(control) }, { "find_limits", MOD_DAT, MOD_CTL, CTL_FINDLIMITS, DO(control) },
{ "firstline", MOD_PAT, MOD_OPT, PCRE2_FIRSTLINE, PO(options) }, { "firstline", MOD_PAT, MOD_OPT, PCRE2_FIRSTLINE, PO(options) },
@ -612,6 +611,7 @@ clock_t total_compile_time = 0;
clock_t total_match_time = 0; clock_t total_match_time = 0;
static uint32_t dfa_matched; static uint32_t dfa_matched;
static uint32_t forbid_utf = 0;
static uint32_t max_oveccount; static uint32_t max_oveccount;
static uint32_t callout_count; static uint32_t callout_count;
@ -830,6 +830,14 @@ are supported. */
pcre2_set_character_tables_16(G(a,16),b); \ pcre2_set_character_tables_16(G(a,16),b); \
else \ else \
pcre2_set_character_tables_32(G(a,32),b) pcre2_set_character_tables_32(G(a,32),b)
/* Install the compile-time recursion guard function b in context a, calling
the 8-bit, 16-bit, or 32-bit library function according to the run-time value
of test_mode. The expansion is an unbraced if/else chain with no trailing
semicolon: the caller supplies the semicolon, and should enclose the call in
braces when it appears under another "if" so that the "else" arms cannot bind
to the wrong statement. */
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
if (test_mode == PCRE8_MODE) \
pcre2_set_compile_recursion_guard_8(G(a,8),b); \
else if (test_mode == PCRE16_MODE) \
pcre2_set_compile_recursion_guard_16(G(a,16),b); \
else \
pcre2_set_compile_recursion_guard_32(G(a,32),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) \ #define PCRE2_SET_MATCH_LIMIT(a,b) \
if (test_mode == PCRE8_MODE) \ if (test_mode == PCRE8_MODE) \
@ -1102,6 +1110,12 @@ the three different cases. */
else \ else \
G(pcre2_set_character_tables_,BITTWO)(G(a,BITTWO),b) G(pcre2_set_character_tables_,BITTWO)(G(a,BITTWO),b)
/* Install the compile-time recursion guard function b in context a, choosing
at run time (via test_mode) between the two code unit widths named by BITONE
and BITTWO. G(x,y) is assumed to paste its two arguments into one token (its
definition is not shown here - confirm). The expansion is an unbraced if/else
with no trailing semicolon, so the caller supplies the semicolon. */
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
G(pcre2_set_compile_recursion_guard_,BITONE)(G(a,BITONE),b); \
else \
G(pcre2_set_compile_recursion_guard_,BITTWO)(G(a,BITTWO),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) \ #define PCRE2_SET_MATCH_LIMIT(a,b) \
if (test_mode == G(G(PCRE,BITONE),_MODE)) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \
G(pcre2_set_match_limit_,BITONE)(G(a,BITONE),b); \ G(pcre2_set_match_limit_,BITONE)(G(a,BITONE),b); \
@ -1245,8 +1259,10 @@ the three different cases. */
#define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_8(G(b,8),c,d) #define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_8(G(b,8),c,d)
#define PCRE2_PRINTINT(a) pcre2_printint_8(compiled_code8,outfile,a) #define PCRE2_PRINTINT(a) pcre2_printint_8(compiled_code8,outfile,a)
#define PCRE2_SET_CALLOUT(a,b,c) \ #define PCRE2_SET_CALLOUT(a,b,c) \
pcre2_set_callout_8(G(a,8),(int (*)(pcre2_callout_block_8 *))b,c); pcre2_set_callout_8(G(a,8),(int (*)(pcre2_callout_block_8 *))b,c)
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_8(G(a,8),b) #define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_8(G(a,8),b)
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
pcre2_set_compile_recursion_guard_8(G(a,8),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_8(G(a,8),b) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_8(G(a,8),b)
#define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_8(G(a,8),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_8(G(a,8),b)
#define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \ #define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \
@ -1304,12 +1320,14 @@ the three different cases. */
#define PCRE2_SET_CALLOUT(a,b,c) \ #define PCRE2_SET_CALLOUT(a,b,c) \
pcre2_set_callout_16(G(a,16),(int (*)(pcre2_callout_block_16 *))b,c); pcre2_set_callout_16(G(a,16),(int (*)(pcre2_callout_block_16 *))b,c);
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_16(G(a,16),b) #define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_16(G(a,16),b)
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
pcre2_set_compile_recursion_guard_16(G(a,16),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_16(G(a,16),b) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_16(G(a,16),b)
#define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_16(G(a,16),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_16(G(a,16),b)
#define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \ #define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \
a = pcre2_substring_copy_byname_16(G(b,16),G(c,16),(PCRE2_UCHAR16 *)d,e); a = pcre2_substring_copy_byname_16(G(b,16),G(c,16),(PCRE2_UCHAR16 *)d,e)
#define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \ #define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_16(G(b,16),c,(PCRE2_UCHAR16 *)d,e); a = pcre2_substring_copy_bynumber_16(G(b,16),c,(PCRE2_UCHAR16 *)d,e)
#define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_16((PCRE2_UCHAR16 *)a) #define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_16((PCRE2_UCHAR16 *)a)
#define PCRE2_SUBSTRING_GET_BYNAME(a,b,c,d) \ #define PCRE2_SUBSTRING_GET_BYNAME(a,b,c,d) \
a = pcre2_substring_get_byname_16(G(b,16),G(c,16),(PCRE2_UCHAR16 **)d) a = pcre2_substring_get_byname_16(G(b,16),G(c,16),(PCRE2_UCHAR16 **)d)
@ -1361,10 +1379,12 @@ the three different cases. */
#define PCRE2_SET_CALLOUT(a,b,c) \ #define PCRE2_SET_CALLOUT(a,b,c) \
pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *))b,c); pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *))b,c);
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b) #define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b)
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
pcre2_set_compile_recursion_guard_32(G(a,32),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_32(G(a,32),b) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_32(G(a,32),b)
#define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_32(G(a,32),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_32(G(a,32),b)
#define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \ #define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \
a = pcre2_substring_copy_byname_32(G(b,32),G(c,32),(PCRE2_UCHAR32 *)d,e); a = pcre2_substring_copy_byname_32(G(b,32),G(c,32),(PCRE2_UCHAR32 *)d,e)
#define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \ #define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_32(G(b,32),c,(PCRE2_UCHAR32 *)d,e); a = pcre2_substring_copy_bynumber_32(G(b,32),c,(PCRE2_UCHAR32 *)d,e);
#define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_32((PCRE2_UCHAR32 *)a) #define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_32((PCRE2_UCHAR32 *)a)
@ -1766,6 +1786,25 @@ free(block);
#endif /* NO_RECURSE */ #endif /* NO_RECURSE */
/*************************************************
* Callback function for stack guard *
*************************************************/
/* This is set up to be called from pcre2_compile() when the stackguard=n
modifier sets a value greater than zero. The test we do is whether the
parenthesis nesting depth is greater than the value set by the modifier.
Argument: the current parenthesis nesting depth
Returns: non-zero to kill the compilation
*/
static int
stack_guard(uint32_t depth)
{
/* A depth that does not exceed the limit held in pat_patctl.stackguard_test
lets the compilation carry on. */

if (depth <= pat_patctl.stackguard_test) return 0;

/* The nesting is deeper than the limit: yield non-zero. */

return 1;
}
/************************************************* /*************************************************
* Convert UTF-8 character to code point * * Convert UTF-8 character to code point *
*************************************************/ *************************************************/
@ -2031,16 +2070,16 @@ return i + 1;
#ifdef SUPPORT_PCRE16 #ifdef SUPPORT_PCRE16
/************************************************* /*************************************************
* Convert a string to 16-bit * * Convert pattern to 16-bit *
*************************************************/ *************************************************/
/* The input is always interpreted as a string of UTF-8 bytes. If all the input /* In UTF mode the input is always interpreted as a string of UTF-8 bytes. If
bytes are ASCII, the space needed for a 16-bit string is exactly double the all the input bytes are ASCII, the space needed for a 16-bit string is exactly
8-bit size. Otherwise, the size needed for a 16-bit string is no more than double the 8-bit size. Otherwise, the size needed for a 16-bit string is no
double, because up to 0xffff uses no more than 3 bytes in UTF-8 but possibly 4 more than double, because up to 0xffff uses no more than 3 bytes in UTF-8 but
in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in UTF-16. The possibly 4 in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in
result is always left in pbuffer16. Impose a minimum size to save repeated UTF-16. The result is always left in pbuffer16. Impose a minimum size to save
re-sizing. repeated re-sizing.
Note that this function does not object to surrogate values. This is Note that this function does not object to surrogate values. This is
deliberate; it makes it possible to construct UTF-16 strings that are invalid, deliberate; it makes it possible to construct UTF-16 strings that are invalid,
@ -2074,9 +2113,13 @@ if (pbuffer16_size < 2*len + 2)
exit(1); exit(1);
} }
} }
pp = pbuffer16;
while (len > 0) pp = pbuffer16;
if (!utf)
{
while (len-- > 0) *pp++ = *p++;
}
else while (len > 0)
{ {
uint32_t c; uint32_t c;
int chlen = utf82ord(p, &c); int chlen = utf82ord(p, &c);
@ -2102,15 +2145,15 @@ return pp - pbuffer16;
#ifdef SUPPORT_PCRE32 #ifdef SUPPORT_PCRE32
/************************************************* /*************************************************
* Convert a string to 32-bit * * Convert pattern to 32-bit *
*************************************************/ *************************************************/
/* The input is always interpreted as a string of UTF-8 bytes. If all the input /* In UTF mode the input is always interpreted as a string of UTF-8 bytes. If
bytes are ASCII, the space needed for a 32-bit string is exactly four times the all the input bytes are ASCII, the space needed for a 32-bit string is exactly
8-bit size. Otherwise, the size needed for a 32-bit string is no more than four four times the 8-bit size. Otherwise, the size needed for a 32-bit string is no
times, because the number of characters must be less than the number of bytes. more than four times, because the number of characters must be less than the
The result is always left in pbuffer32. Impose a minimum size to save repeated number of bytes. The result is always left in pbuffer32. Impose a minimum size
re-sizing. to save repeated re-sizing.
Note that this function does not object to surrogate values. This is Note that this function does not object to surrogate values. This is
deliberate; it makes it possible to construct UTF-32 strings that are invalid, deliberate; it makes it possible to construct UTF-32 strings that are invalid,
@ -2143,9 +2186,13 @@ if (pbuffer32_size < 4*len + 4)
exit(1); exit(1);
} }
} }
pp = pbuffer32;
while (len > 0) pp = pbuffer32;
if (!utf)
{
while (len-- > 0) *pp++ = *p++;
}
else while (len > 0)
{ {
uint32_t c; uint32_t c;
int chlen = utf82ord(p, &c); int chlen = utf82ord(p, &c);
@ -3020,9 +3067,26 @@ if ((pat_patctl.control & CTL_INFO) != 0)
pattern_info(PCRE2_INFO_ARGOPTIONS, &compile_options); pattern_info(PCRE2_INFO_ARGOPTIONS, &compile_options);
pattern_info(PCRE2_INFO_ALLOPTIONS, &overall_options); pattern_info(PCRE2_INFO_ALLOPTIONS, &overall_options);
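/* Remove NEVER_UTF/NEVER_UCP from the options that are displayed if they were
there only because of forbid_utf, that is, when the pattern's own options did
not set them. This saves cluttering up the verification output of non-UTF test
files. */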
if ((pat_patctl.options & PCRE2_NEVER_UTF) == 0)
{
compile_options &= ~PCRE2_NEVER_UTF;
overall_options &= ~PCRE2_NEVER_UTF;
}
if ((pat_patctl.options & PCRE2_NEVER_UCP) == 0)
{
compile_options &= ~PCRE2_NEVER_UCP;
overall_options &= ~PCRE2_NEVER_UCP;
}
if ((compile_options|overall_options) == 0) if ((compile_options|overall_options) == 0)
fprintf(outfile, "No options\n"); fprintf(outfile, "No options\n");
else if (compile_options == overall_options)
show_compile_options(compile_options, "Options:", "\n");
else else
{ {
show_compile_options(compile_options, "Compile options:", "\n"); show_compile_options(compile_options, "Compile options:", "\n");
@ -3035,26 +3099,26 @@ if ((pat_patctl.control & CTL_INFO) != 0)
fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)? fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)?
"any Unicode newline" : "CR, LF, or CRLF"); "any Unicode newline" : "CR, LF, or CRLF");
switch (newline_convention) if (newline_convention != NEWLINE_DEFAULT) switch (newline_convention)
{ {
case PCRE2_NEWLINE_CR: case PCRE2_NEWLINE_CR:
fprintf(outfile, "Newline is CR\n"); fprintf(outfile, "Forced newline is CR\n");
break; break;
case PCRE2_NEWLINE_LF: case PCRE2_NEWLINE_LF:
fprintf(outfile, "Newline is LF\n"); fprintf(outfile, "Forced newline is LF\n");
break; break;
case PCRE2_NEWLINE_CRLF: case PCRE2_NEWLINE_CRLF:
fprintf(outfile, "Newline is CRLF\n"); fprintf(outfile, "Forced newline is CRLF\n");
break; break;
case PCRE2_NEWLINE_ANYCRLF: case PCRE2_NEWLINE_ANYCRLF:
fprintf(outfile, "Newline is CR, LF, or CRLF\n"); fprintf(outfile, "Forced newline is CR, LF, or CRLF\n");
break; break;
case PCRE2_NEWLINE_ANY: case PCRE2_NEWLINE_ANY:
fprintf(outfile, "Newline is any Unicode newline\n"); fprintf(outfile, "Forced newline is any Unicode newline\n");
break; break;
default: default:
@ -3063,7 +3127,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
if (first_ctype == 2) if (first_ctype == 2)
{ {
fprintf(outfile, "First char at start or follows newline\n"); fprintf(outfile, "First code unit at start or follows newline\n");
} }
else if (first_ctype == 1) else if (first_ctype == 1)
{ {
@ -3079,35 +3143,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
fprintf(outfile, "%s\n", caseless); fprintf(outfile, "%s\n", caseless);
} }
} }
else else if (start_bits != NULL)
{
fprintf(outfile, "No first code unit\n");
}
if (last_ctype == 0)
{
fprintf(outfile, "No last code unit\n");
}
else
{
const char *caseless =
((FLD(compiled_code, flags) & PCRE2_LASTCASELESS) == 0)?
"" : " (caseless)";
if (PRINTOK(last_cunit))
fprintf(outfile, "Last code unit = \'%c\'%s\n", last_cunit, caseless);
else
{
fprintf(outfile, "Last code unit = ");
pchar(last_cunit, FALSE, outfile);
fprintf(outfile, "%s\n", caseless);
}
}
fprintf(outfile, "Subject length lower bound = %d\n", minlength);
if (start_bits == NULL)
fprintf(outfile, "No starting code unit list\n");
else
{ {
int i; int i;
int c = 24; int c = 24;
@ -3135,6 +3171,31 @@ if ((pat_patctl.control & CTL_INFO) != 0)
} }
fprintf(outfile, "\n"); fprintf(outfile, "\n");
} }
else
{
fprintf(outfile, "No first code unit\n");
}
if (last_ctype == 0)
{
fprintf(outfile, "No last code unit\n");
}
else
{
const char *caseless =
((FLD(compiled_code, flags) & PCRE2_LASTCASELESS) == 0)?
"" : " (caseless)";
if (PRINTOK(last_cunit))
fprintf(outfile, "Last code unit = \'%c\'%s\n", last_cunit, caseless);
else
{
fprintf(outfile, "Last code unit = ");
pchar(last_cunit, FALSE, outfile);
fprintf(outfile, "%s\n", caseless);
}
}
fprintf(outfile, "Subject length lower bound = %d\n", minlength);
/* FIXME: tidy this up */ /* FIXME: tidy this up */
@ -3183,7 +3244,11 @@ if (restrict_for_perl_test)
return PR_ABEND; return PR_ABEND;
} }
if (strncmp((char *)buffer, "#pattern", 8) == 0 && isspace(buffer[8])) if (strncmp((char *)buffer, "#forbid_utf", 11) == 0 && isspace(buffer[11]))
{
forbid_utf = PCRE2_NEVER_UTF|PCRE2_NEVER_UCP;
}
else if (strncmp((char *)buffer, "#pattern", 8) == 0 && isspace(buffer[8]))
{ {
(void)decode_modifiers(buffer + 8, CTX_DEFPAT, &def_patctl, NULL); (void)decode_modifiers(buffer + 8, CTX_DEFPAT, &def_patctl, NULL);
} }
@ -3491,6 +3556,13 @@ else switch (pat_patctl.tables_id)
PCRE2_SET_CHARACTER_TABLES(pat_context, use_tables); PCRE2_SET_CHARACTER_TABLES(pat_context, use_tables);
/* Set up for the stackguard test. */
if (pat_patctl.stackguard_test != 0)
{
PCRE2_SET_COMPILE_RECURSION_GUARD(pat_context, stack_guard);
}
/* Handle compiling via the POSIX interface, which doesn't support the /* Handle compiling via the POSIX interface, which doesn't support the
timing, showing, or debugging options, nor the ability to pass over timing, showing, or debugging options, nor the ability to pass over
local character tables. Neither does it have 16-bit or 32-bit support. */ local character tables. Neither does it have 16-bit or 32-bit support. */
@ -3604,7 +3676,7 @@ if (timeit > 0)
for (i = 0; i < timeit; i++) for (i = 0; i < timeit; i++)
{ {
PCRE2_COMPILE(compiled_code, pbuffer, patlen, PCRE2_COMPILE(compiled_code, pbuffer, patlen,
pat_patctl.options, &errorcode, &erroroffset, pat_context); pat_patctl.options|forbid_utf, &errorcode, &erroroffset, pat_context);
if (TEST(compiled_code, !=, NULL)) if (TEST(compiled_code, !=, NULL))
{ SUB1(pcre2_code_free, compiled_code); } { SUB1(pcre2_code_free, compiled_code); }
} }
@ -3618,8 +3690,8 @@ if (timeit > 0)
/* A final compile that is used "for real". */ /* A final compile that is used "for real". */
PCRE2_COMPILE(compiled_code, pbuffer, patlen, pat_patctl.options, &errorcode, PCRE2_COMPILE(compiled_code, pbuffer, patlen, pat_patctl.options|forbid_utf,
&erroroffset, pat_context); &errorcode, &erroroffset, pat_context);
/* Compilation failed; go back for another re, skipping to blank line /* Compilation failed; go back for another re, skipping to blank line
if non-interactive. */ if non-interactive. */
@ -3782,15 +3854,13 @@ for (;;)
min = mid; min = mid;
mid = (mid == max - 1)? max : (max != UINT32_MAX)? (min + max)/2 : mid*2; mid = (mid == max - 1)? max : (max != UINT32_MAX)? (min + max)/2 : mid*2;
} }
else if (capcount >= 0 || else if (capcount >= 0 ||
capcount == PCRE2_ERROR_NOMATCH || capcount == PCRE2_ERROR_NOMATCH ||
capcount == PCRE2_ERROR_PARTIAL) capcount == PCRE2_ERROR_PARTIAL)
{ {
if (mid == min + 1) if (mid == min + 1)
{ {
if (capcount != PCRE2_ERROR_NOMATCH) fprintf(outfile, "Minimum %s limit = %d\n", msg, mid);
fprintf(outfile, "Minimum %s limit = %d\n", msg, mid);
break; break;
} }
max = mid; max = mid;
@ -4184,8 +4254,11 @@ while ((c = *p++) != 0)
continue; continue;
default: default:
fprintf(outfile, "** Unrecognized escape sequence \"\\%c\"\n", c); if (isalnum(c))
return PR_OK; {
fprintf(outfile, "** Unrecognized escape sequence \"\\%c\"\n", c);
return PR_OK;
}
} }
/* We now have a character value in c that may be greater than 255. /* We now have a character value in c that may be greater than 255.
@ -4608,7 +4681,12 @@ for (gmatched = 0;; gmatched++)
PCRE2_SUBSTRING_COPY_BYNUMBER(rc, match_data, n, copybuffer, PCRE2_SUBSTRING_COPY_BYNUMBER(rc, match_data, n, copybuffer,
sizeof(copybuffer)/code_unit_size); sizeof(copybuffer)/code_unit_size);
if (rc < 0) if (rc < 0)
fprintf(outfile, "copy substring %d failed %d\n", n, rc); {
fprintf(outfile, "copy substring %d failed (%d): ", n, rc);
PCRE2_GET_ERROR_MESSAGE(rc, rc, pbuffer);
PCHARSV(CASTVAR(void *, pbuffer), 0, rc, FALSE, outfile);
fprintf(outfile, "\n");
}
else else
{ {
fprintf(outfile, "%2dC ", n); fprintf(outfile, "%2dC ", n);
@ -4641,7 +4719,10 @@ for (gmatched = 0;; gmatched++)
copybuffer, sizeof(copybuffer)/code_unit_size); copybuffer, sizeof(copybuffer)/code_unit_size);
if (rc < 0) if (rc < 0)
{ {
fprintf(outfile, "copy substring '%s' failed %d\n", nptr, rc); fprintf(outfile, "copy substring '%s' failed (%d): ", nptr, rc);
PCRE2_GET_ERROR_MESSAGE(rc, rc, pbuffer);
PCHARSV(CASTVAR(void *, pbuffer), 0, rc, FALSE, outfile);
fprintf(outfile, "\n");
} }
else else
{ {
@ -4661,7 +4742,12 @@ for (gmatched = 0;; gmatched++)
uint32_t n = (uint32_t)(dat_datctl.get_numbers[i]); uint32_t n = (uint32_t)(dat_datctl.get_numbers[i]);
PCRE2_SUBSTRING_GET_BYNUMBER(rc, match_data, n, &gotbuffer); PCRE2_SUBSTRING_GET_BYNUMBER(rc, match_data, n, &gotbuffer);
if (rc < 0) if (rc < 0)
fprintf(outfile, "get substring %d failed %d\n", n, rc); {
fprintf(outfile, "get substring %d failed (%d): ", n, rc);
PCRE2_GET_ERROR_MESSAGE(rc, rc, pbuffer);
PCHARSV(CASTVAR(void *, pbuffer), 0, rc, FALSE, outfile);
fprintf(outfile, "\n");
}
else else
{ {
fprintf(outfile, "%2dG ", n); fprintf(outfile, "%2dG ", n);
@ -4694,7 +4780,10 @@ for (gmatched = 0;; gmatched++)
PCRE2_SUBSTRING_GET_BYNAME(rc, match_data, pbuffer, &gotbuffer); PCRE2_SUBSTRING_GET_BYNAME(rc, match_data, pbuffer, &gotbuffer);
if (rc < 0) if (rc < 0)
{ {
fprintf(outfile, "get substring '%s' failed %d\n", nptr, rc); fprintf(outfile, "get substring '%s' failed (%d): ", nptr, rc);
PCRE2_GET_ERROR_MESSAGE(rc, rc, pbuffer);
PCHARSV(CASTVAR(void *, pbuffer), 0, rc, FALSE, outfile);
fprintf(outfile, "\n");
} }
else else
{ {
@ -4715,7 +4804,12 @@ for (gmatched = 0;; gmatched++)
size_t *lengths; size_t *lengths;
PCRE2_SUBSTRING_LIST_GET(rc, match_data, &stringlist, &lengths); PCRE2_SUBSTRING_LIST_GET(rc, match_data, &stringlist, &lengths);
if (rc < 0) if (rc < 0)
fprintf(outfile, "get substring list failed %d\n", rc); {
fprintf(outfile, "get substring list failed (%d): ", rc);
PCRE2_GET_ERROR_MESSAGE(rc, rc, pbuffer);
PCHARSV(CASTVAR(void *, pbuffer), 0, rc, FALSE, outfile);
fprintf(outfile, "\n");
}
else else
{ {
for (i = 0; i < capcount; i++) for (i = 0; i < capcount; i++)
@ -4737,7 +4831,6 @@ for (gmatched = 0;; gmatched++)
else if (capcount == PCRE2_ERROR_PARTIAL) else if (capcount == PCRE2_ERROR_PARTIAL)
{ {
PCRE2_OFFSET leftchar = FLD(match_data, leftchar); PCRE2_OFFSET leftchar = FLD(match_data, leftchar);
fprintf(outfile, "Partial match"); fprintf(outfile, "Partial match");
if (leftchar != FLD(match_data, startchar)) if (leftchar != FLD(match_data, startchar))
fprintf(outfile, " at offset %d", (int)FLD(match_data, startchar)); fprintf(outfile, " at offset %d", (int)FLD(match_data, startchar));
@ -4880,8 +4973,8 @@ for (gmatched = 0;; gmatched++)
else else
{ {
pp += end_offset * code_unit_size; pp += end_offset * code_unit_size;
len -= end_offset; len -= end_offset * code_unit_size;
ulen -= end_offset *code_unit_size; ulen -= end_offset;
} }
} }
} /* End of global loop */ } /* End of global loop */
@ -4894,7 +4987,7 @@ return PR_OK;
/************************************************* /*************************************************
* Print PCRE version * * Print PCRE2 version *
*************************************************/ *************************************************/
/* The version string was read into 'version' at the start of execution. */ /* The version string was read into 'version' at the start of execution. */
@ -4903,7 +4996,7 @@ static void
print_version(FILE *f) print_version(FILE *f)
{ {
VERSION_TYPE *vp; VERSION_TYPE *vp;
fprintf(f, "PCRE version "); fprintf(f, "PCRE2 version ");
for (vp = version; *vp != 0; vp++) fprintf(f, "%c", *vp); for (vp = version; *vp != 0; vp++) fprintf(f, "%c", *vp);
fprintf(f, "\n"); fprintf(f, "\n");
} }
@ -4976,6 +5069,7 @@ printf(" -d set default pattern control 'debug'\n");
printf(" -dfa set default subject control 'dfa'\n"); printf(" -dfa set default subject control 'dfa'\n");
printf(" -help show usage information\n"); printf(" -help show usage information\n");
printf(" -i set default pattern control 'info'\n"); printf(" -i set default pattern control 'info'\n");
printf(" -jit set default pattern control 'jit'\n");
printf(" -q quiet: do not output PCRE version number at start\n"); printf(" -q quiet: do not output PCRE version number at start\n");
printf(" -pattern <s> set default pattern control fields\n"); printf(" -pattern <s> set default pattern control fields\n");
printf(" -subject <s> set default subject control fields\n"); printf(" -subject <s> set default subject control fields\n");
@ -5261,10 +5355,18 @@ while (argc > 1 && argv[op][0] == '-')
/* Set some common pattern and subject controls */ /* Set some common pattern and subject controls */
else if (strcmp(arg, "-dfa") == 0) def_datctl.control |= CTL_DFA;
else if (strcmp(arg, "-b") == 0) def_patctl.control |= CTL_FULLBINCODE; else if (strcmp(arg, "-b") == 0) def_patctl.control |= CTL_FULLBINCODE;
else if (strcmp(arg, "-d") == 0) def_patctl.control |= CTL_DEBUG; else if (strcmp(arg, "-d") == 0) def_patctl.control |= CTL_DEBUG;
else if (strcmp(arg, "-i") == 0) def_patctl.control |= CTL_INFO; else if (strcmp(arg, "-i") == 0) def_patctl.control |= CTL_INFO;
else if (strcmp(arg, "-dfa") == 0) def_datctl.control |= CTL_DFA; else if (strcmp(arg, "-jit") == 0)
{
def_patctl.jit = 7; /* full & partial */
#ifndef SUPPORT_JIT
fprintf(stderr, "** Warning: JIT support is not available: "
"-jit calls dummy functions.\n");
#endif
}
/* Set timing parameters */ /* Set timing parameters */
@ -5500,10 +5602,11 @@ while (notdone)
else else
{ {
while (isspace(*p)) p++; while (isspace(*p)) p++;
if (*p != 0) if (*p != 0)
{ {
fprintf(stderr, "** Invalid pattern delimiter '%c'.\n", *buffer); fprintf(outfile, "** Invalid pattern delimiter '%c' (x%x).\n", *buffer,
*buffer);
rc = PR_SKIP; rc = PR_SKIP;
} }
} }

5695
testdata/testinput1 vendored Normal file

File diff suppressed because it is too large Load Diff

4034
testdata/testinput2 vendored Normal file

File diff suppressed because it is too large Load Diff

9389
testdata/testoutput1 vendored Normal file

File diff suppressed because it is too large Load Diff

14400
testdata/testoutput2 vendored Normal file

File diff suppressed because it is too large Load Diff