From bb68c64c4067a4936ac45a6493eccbeee665ccf1 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 8 Sep 2015 17:01:17 +0000 Subject: [PATCH] Implement #newline_default and adjust testdata and scripts to use it. --- ChangeLog | 7 ++++ RunGrepTest | 19 ++++++++-- RunTest | 10 ++++-- doc/pcre2test.1 | 32 +++++++++++++++-- maint/ManyConfigTests | 33 ++++++------------ src/pcre2test.c | 75 +++++++++++++++++++++++++++++++++------- testdata/testinput1 | 1 + testdata/testinput11 | 1 + testdata/testinput18 | 6 ++-- testdata/testinput2 | 1 + testdata/testinput4 | 1 + testdata/testinput6 | 1 + testdata/testinput7 | 1 + testdata/testinput9 | 1 + testdata/testoutput1 | 1 + testdata/testoutput11-16 | 1 + testdata/testoutput11-32 | 1 + testdata/testoutput18 | 6 ++-- testdata/testoutput2 | 1 + testdata/testoutput4 | 1 + testdata/testoutput6 | 1 + testdata/testoutput7 | 1 + testdata/testoutput9 | 1 + 23 files changed, 153 insertions(+), 50 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3f1fb23..4e18926 100644 --- a/ChangeLog +++ b/ChangeLog @@ -172,6 +172,13 @@ commas. 49. Fixed two issues in JIT. These were found by Karl Skomski with a custom LLVM fuzzer. +50. The pcre2test program has been extended by adding the #newline_default +command. This has made it possible to run the standard tests when PCRE2 is +compiled with either CR or CRLF as the default newline convention. As part of +this work, the new command was added to several test files and the testing +scripts were modified. The pcre2grep tests can now also be run when there is no +LF in the default newline convention. + Version 10.20 30-June-2015 -------------------------- diff --git a/RunGrepTest b/RunGrepTest index 79bf5a3..bdbb40b 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -19,12 +19,18 @@ unset cp ls mv rm builddir=`pwd` pcre2grep=$builddir/pcre2grep +pcre2test=$builddir/pcre2test if [ ! -x $pcre2grep ] ; then echo "** $pcre2grep does not exist or is not execuatble." exit 1 fi +if [ ! 
-x $pcre2test ] ; then + echo "** $pcre2test does not exist or is not execuatble." + exit 1 +fi + valgrind= while [ $# -gt 0 ] ; do case $1 in @@ -34,7 +40,6 @@ while [ $# -gt 0 ] ; do shift done -echo " " pcre2grep_version=`$pcre2grep -V` if [ "$valgrind" = "" ] ; then echo "Testing $pcre2grep_version" @@ -69,14 +74,22 @@ fi # Check for the availability of UTF-8 support -./pcre2test -C unicode >/dev/null +$pcre2test -C unicode >/dev/null utf8=$? +# Check default newline convention. If it does not include LF, force LF. + +nl=`$pcre2test -C newline` +if [ "$nl" != "LF" -a "$nl" != "ANY" -a "$nl" != "ANYCRLF" ]; then + pcre2grep="$pcre2grep -N LF" + echo "Default newline setting forced to LF" +fi + # ------ Function to run and check a special pcre2grep arguments test ------- checkspecial() { - $valgrind ./pcre2grep $1 >>testtrygrep 2>&1 + $valgrind $pcre2grep $1 >>testtrygrep 2>&1 if [ $? -ne $2 ] ; then echo "** pcre2grep $1 failed - check testtrygrep" exit 1 diff --git a/RunTest b/RunTest index 9f5efac..d72a574 100755 --- a/RunTest +++ b/RunTest @@ -75,7 +75,7 @@ title17="Test 17: JIT-specific features when JIT is available" title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP" title19="Test 19: Tests of the POSIX interface with UTF/UCP" title20="Test 20: Serialization tests" -maxtest=18 +maxtest=20 if [ $# -eq 1 -a "$1" = "list" ]; then echo $title0 @@ -699,8 +699,12 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $do14 = yes ] ; then echo $title14 - $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry - checkresult $? 14-$bits "" + if [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry + checkresult $? 
14-$bits "" + fi fi # Test non-JIT match and recursion limits diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index df822ae..d5169e6 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "03 September 2015" "PCRE 10.21" +.TH PCRE2TEST 1 "08 September 2015" "PCRE 10.21" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -259,6 +259,34 @@ described in the section entitled "Saving and restoring compiled patterns" .\" below. .\" +.sp + #newline_default [<newline-list>] +.sp +When PCRE2 is built, a default newline convention can be specified. This +determines which characters and/or character pairs are recognized as indicating +a newline in a pattern or subject string. The default can be overridden when a +pattern is compiled. The standard test files contain tests of various newline +conventions, but the majority of the tests expect a single linefeed to be +recognized as a newline by default. Without special action the tests would fail +when PCRE2 is compiled with either CR or CRLF as the default newline. +.P +The #newline_default command specifies a list of newline types that are +acceptable as the default. Each type must be one of CR, LF, CRLF, ANYCRLF, or +ANY (in upper or lower case), for example: +.sp + #newline_default LF Any anyCRLF +.sp +If the default newline is in the list, this command has no effect. Otherwise, +except when testing the POSIX API, a \fBnewline\fP modifier that specifies the +first newline convention in the list (LF in the above example) is added to any +pattern that does not already have a \fBnewline\fP modifier. If the newline +list is empty, the feature is turned off. This command is present in a number +of the standard test input files. +.P +When the POSIX API is being tested there is no way to override the default +newline convention, though it is possible to set the newline convention from +within the pattern. 
A warning is given if the \fBposix\fP modifier is used when +\fB#newline_default\fP would set a default for the non-POSIX API. .sp #pattern .sp @@ -1457,6 +1485,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 03 September 2015 +Last updated: 08 September 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/maint/ManyConfigTests b/maint/ManyConfigTests index 1830baa..a0401e1 100755 --- a/maint/ManyConfigTests +++ b/maint/ManyConfigTests @@ -131,33 +131,22 @@ runtest() ./pcre2test -C fi - nl=`./pcre2test -C newline` - if [ "$nl" = "LF" -o "$nl" = "ANY" -o "$nl" = "ANYCRLF" ]; then - nlok=1 - else - nlok=0 - fi - ./pcre2test -C jit >/dev/null jit=$? ./pcre2test -C pcre2-8 >/dev/null pcre2_8=$? - if [ $nlok -gt 0 ]; then - echo "Running PCRE2 library tests $withvalgrind" - $srcdir/RunTest $valgrind >teststdout 2>teststderr - if [ $? -ne 0 -o -s teststderr ]; then - echo " " - echo "**** Test failed ****" - cat teststderr - if [ -s teststdout ] ; then cat teststdout; fi - exit 1 - fi - else - echo "Skipping PCRE2 library tests: newline is $nl" + echo "Running PCRE2 library tests $withvalgrind" + $srcdir/RunTest $valgrind >teststdout 2>teststderr + if [ $? -ne 0 -o -s teststderr ]; then + echo " " + echo "**** Test failed ****" + cat teststderr + if [ -s teststdout ] ; then cat teststdout; fi + exit 1 fi - if [ $nlok -gt 0 -a $pcre2_8 -gt 0 ]; then + if [ $pcre2_8 -gt 0 ]; then echo "Running pcre2grep tests $withvalgrind" $srcdir/RunGrepTest $valgrind >teststdout 2>teststderr if [ $? 
-ne 0 -o -s teststderr ]; then @@ -167,10 +156,8 @@ runtest() cat teststdout exit 1 fi - elif [ $nlok -gt 0 ]; then - echo "Skipping pcre2grep tests: 8-bit library not compiled" else - echo "Skipping pcre2grep tests: newline is $nl" + echo "Skipping pcre2grep tests: 8-bit library not compiled" fi if [ "$jit" -gt 0 ]; then diff --git a/src/pcre2test.c b/src/pcre2test.c index 3f0af15..de2e1dd 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -336,17 +336,18 @@ typedef struct cmdstruct { int value; } cmdstruct; -enum { CMD_FORBID_UTF, CMD_LOAD, CMD_PATTERN, CMD_PERLTEST, CMD_POP, CMD_SAVE, - CMD_SUBJECT, CMD_UNKNOWN }; +enum { CMD_FORBID_UTF, CMD_LOAD, CMD_NEWLINE_DEFAULT, CMD_PATTERN, + CMD_PERLTEST, CMD_POP, CMD_SAVE, CMD_SUBJECT, CMD_UNKNOWN }; static cmdstruct cmdlist[] = { - { "forbid_utf", CMD_FORBID_UTF }, - { "load", CMD_LOAD }, - { "pattern", CMD_PATTERN }, - { "perltest", CMD_PERLTEST }, - { "pop", CMD_POP }, - { "save", CMD_SAVE }, - { "subject", CMD_SUBJECT }}; + { "forbid_utf", CMD_FORBID_UTF }, + { "load", CMD_LOAD }, + { "newline_default", CMD_NEWLINE_DEFAULT }, + { "pattern", CMD_PATTERN }, + { "perltest", CMD_PERLTEST }, + { "pop", CMD_POP }, + { "save", CMD_SAVE }, + { "subject", CMD_SUBJECT }}; #define cmdlistcount sizeof(cmdlist)/sizeof(cmdstruct) @@ -720,6 +721,8 @@ static uint32_t maxlookbehind; static uint32_t max_oveccount; static uint32_t callout_count; +static uint16_t local_newline_default = 0; + static VERSION_TYPE jittarget[VERSION_SIZE]; static VERSION_TYPE version[VERSION_SIZE]; static VERSION_TYPE uversion[VERSION_SIZE]; @@ -3420,7 +3423,7 @@ Returns: nothing static void show_controls(uint32_t controls, const char *before) { -fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "", ((controls & CTL_ALLAFTERTEXT) != 0)? 
" allaftertext" : "", @@ -3428,6 +3431,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "", ((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "", ((controls & CTL_BINCODE) != 0)? " bincode" : "", + ((controls & CTL_BSR_SET) != 0)? " bsr" : "", ((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "", ((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "", ((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "", @@ -3442,6 +3446,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((controls & CTL_JITVERIFY) != 0)? " jitverify" : "", ((controls & CTL_MARK) != 0)? " mark" : "", ((controls & CTL_MEMORY) != 0)? " memory" : "", + ((controls & CTL_NL_SET) != 0)? " newline" : "", ((controls & CTL_POSIX) != 0)? " posix" : "", ((controls & CTL_PUSH) != 0)? " push" : "", ((controls & CTL_STARTCHAR) != 0)? " startchar" : "", @@ -3472,7 +3477,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", - ((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "", + ((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "", ((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "", ((options & PCRE2_ANCHORED) != 0)? " anchored" : "", ((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "", @@ -3775,8 +3780,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)? 
"any Unicode newline" : "CR, LF, or CRLF"); - if ((pat_patctl.control & CTL_NL_SET) != 0 || - (FLD(compiled_code, flags) & PCRE2_NL_SET) != 0) + if ((FLD(compiled_code, flags) & PCRE2_NL_SET) != 0) { switch (newline_convention) { @@ -3993,6 +3997,7 @@ FILE *f; PCRE2_SIZE serial_size; size_t i; int rc, cmd, cmdlen; +uint16_t first_listed_newline; const char *cmdname; uint8_t *argptr, *serial; @@ -4047,6 +4052,31 @@ switch(cmd) (void)decode_modifiers(argptr, CTX_DEFDAT, NULL, &def_datctl); break; + /* Check the default newline, and if not one of those listed, set up the + first one to be forced. An empty list unsets. */ + + case CMD_NEWLINE_DEFAULT: + local_newline_default = 0; /* Unset */ + first_listed_newline = 0; + for (;;) + { + while (isspace(*argptr)) argptr++; + if (*argptr == 0) break; + for (i = 1; i < sizeof(newlines)/sizeof(char *); i++) + { + size_t nlen = strlen(newlines[i]); + if (strncmpic(argptr, (const uint8_t *)newlines[i], nlen) == 0 && + isspace(argptr[nlen])) + { + if (i == NEWLINE_DEFAULT) return PR_OK; /* Default is valid */ + if (first_listed_newline == 0) first_listed_newline = i; + } + } + while (*argptr != 0 && !isspace(*argptr)) argptr++; + } + local_newline_default = first_listed_newline; + break; + /* Pop a compiled pattern off the stack. Modifiers that do not affect the compiled pattern (e.g. to give information) are permitted. The default pattern modifiers are ignored. */ @@ -4371,6 +4401,8 @@ if ((pat_patctl.control & CTL_POSIX) != 0) show_controls(pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS, msg); msg = ""; } + + if (local_newline_default != 0) prmsg(&msg, "#newline_default"); if (msg[0] == 0) fprintf(outfile, "\n"); @@ -4461,6 +4493,15 @@ if we had a hex pattern. */ if ((pat_patctl.control & CTL_HEXPAT) == 0) patlen = PCRE2_ZERO_TERMINATED; +/* If #newline_default has been used and the library was not compiled with an +appropriate default newline setting, local_newline_default will be non-zero. 
We +use this if there is no explicit newline modifier. */ + +if ((pat_patctl.control & CTL_NL_SET) == 0 && local_newline_default != 0) + { + SETFLD(pat_context, newline_convention, local_newline_default); + } + /* Compile many times when timing. */ if (timeit > 0) @@ -4551,6 +4592,14 @@ if (pat_patctl.jit != 0) } } +/* If an explicit newline modifier was given, set the information flag in the +pattern so that it is preserved over push/pop. */ + +if ((pat_patctl.control & CTL_NL_SET) != 0) + { + SETFLD(compiled_code, flags, FLD(compiled_code, flags) | PCRE2_NL_SET); + } + /* Output code size and other information if requested. */ if ((pat_patctl.control & CTL_MEMORY) != 0) show_memory_info(); diff --git a/testdata/testinput1 b/testdata/testinput1 index e27366b..3a40bab 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -3,6 +3,7 @@ # 32-bit PCRE libraries, and also using the perltest.pl script. #forbid_utf +#newline_default lf any anycrlf #perltest /the quick brown fox/ diff --git a/testdata/testinput11 b/testdata/testinput11 index 43d89e1..a670c1c 100644 --- a/testdata/testinput11 +++ b/testdata/testinput11 @@ -4,6 +4,7 @@ # different, so they have separate output files. #forbid_utf +#newline_default LF ANY ANYCRLF /a\Cb/ aXb diff --git a/testdata/testinput18 b/testdata/testinput18 index db5a48b..fcf9e21 100644 --- a/testdata/testinput18 +++ b/testdata/testinput18 @@ -38,11 +38,11 @@ the quick brown fox The Quick Brown Fox -/abc.def/ +/(*LF)abc.def/ *** Failers abc\ndef -/abc$/ +/(*LF)abc$/ abc abc\n @@ -57,7 +57,7 @@ aaaabbbbzzzz\=ovector=1 aaaabbbbzzzz\=ovector=2 -/ab.cd/ +/(*ANY)ab.cd/ ab-cd ab=cd ** Failers diff --git a/testdata/testinput2 b/testdata/testinput2 index 3df7590..719f3e4 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -9,6 +9,7 @@ # test 5. 
#forbid_utf +#newline_default lf any anycrlf # Test binary zeroes in the pattern diff --git a/testdata/testinput4 b/testdata/testinput4 index 1210272..90c4c25 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -3,6 +3,7 @@ # some of the property tests may differ because of different versions of # Unicode in use by PCRE2 and Perl. +#newline_default lf anycrlf any #perltest /a.b/utf diff --git a/testdata/testinput6 b/testdata/testinput6 index 2651c91..636447a 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -4,6 +4,7 @@ #forbid_utf #subject dfa +#newline_default lf anycrlf any /abc/ abc diff --git a/testdata/testinput7 b/testdata/testinput7 index 879a414..70d8599 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -3,6 +3,7 @@ # used to force DFA matching for all tests. #subject dfa +#newline_default LF any anyCRLF /\x{100}ab/utf \x{100}ab diff --git a/testdata/testinput9 b/testdata/testinput9 index 7b97113..4d4bbe3 100644 --- a/testdata/testinput9 +++ b/testdata/testinput9 @@ -2,6 +2,7 @@ # UTF-8 or Unicode property support. */ #forbid_utf +#newline_default lf any anycrlf /a\Cb/ aXb diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 343830f..1fe3d9d 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -3,6 +3,7 @@ # 32-bit PCRE libraries, and also using the perltest.pl script. #forbid_utf +#newline_default lf any anycrlf #perltest /the quick brown fox/ diff --git a/testdata/testoutput11-16 b/testdata/testoutput11-16 index eda8938..288d0a9 100644 --- a/testdata/testoutput11-16 +++ b/testdata/testoutput11-16 @@ -4,6 +4,7 @@ # different, so they have separate output files. #forbid_utf +#newline_default LF ANY ANYCRLF /a\Cb/ aXb diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32 index f048785..4a52966 100644 --- a/testdata/testoutput11-32 +++ b/testdata/testoutput11-32 @@ -4,6 +4,7 @@ # different, so they have separate output files. 
#forbid_utf +#newline_default LF ANY ANYCRLF /a\Cb/ aXb diff --git a/testdata/testoutput18 b/testdata/testoutput18 index 6c4a564..868e211 100644 --- a/testdata/testoutput18 +++ b/testdata/testoutput18 @@ -58,13 +58,13 @@ No match: POSIX code 17: match failed The Quick Brown Fox 0: The Quick Brown Fox -/abc.def/ +/(*LF)abc.def/ *** Failers No match: POSIX code 17: match failed abc\ndef No match: POSIX code 17: match failed -/abc$/ +/(*LF)abc$/ abc 0: abc abc\n @@ -91,7 +91,7 @@ Matched without capture 0: aaaabbbbzz 1: bbbb -/ab.cd/ +/(*ANY)ab.cd/ ab-cd 0: ab-cd ab=cd diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 382ea58..a91028e 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -9,6 +9,7 @@ # test 5. #forbid_utf +#newline_default lf any anycrlf # Test binary zeroes in the pattern diff --git a/testdata/testoutput4 b/testdata/testoutput4 index d9e3053..6d6e5bf 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -3,6 +3,7 @@ # some of the property tests may differ because of different versions of # Unicode in use by PCRE2 and Perl. +#newline_default lf anycrlf any #perltest /a.b/utf diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 0fef124..f52af6f 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -4,6 +4,7 @@ #forbid_utf #subject dfa +#newline_default lf anycrlf any /abc/ abc diff --git a/testdata/testoutput7 b/testdata/testoutput7 index a7f6a62..d0986df 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -3,6 +3,7 @@ # used to force DFA matching for all tests. #subject dfa +#newline_default LF any anyCRLF /\x{100}ab/utf \x{100}ab diff --git a/testdata/testoutput9 b/testdata/testoutput9 index bd56523..e198d36 100644 --- a/testdata/testoutput9 +++ b/testdata/testoutput9 @@ -2,6 +2,7 @@ # UTF-8 or Unicode property support. */ #forbid_utf +#newline_default lf any anycrlf /a\Cb/ aXb