From 8febd27344feb593d1b482b161a4703ee6478fec Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 29 Jan 2016 18:16:59 +0000 Subject: [PATCH] Two pcre2test extensions: quoted literal substrings in hex patterns and detection of unsupported binary zeros in file input. --- ChangeLog | 7 ++++ doc/pcre2test.1 | 45 +++++++++++++--------- src/pcre2test.c | 91 ++++++++++++++++++++++++++++++++++---------- testdata/testinput2 | 12 ++++-- testdata/testinput6 | 3 +- testdata/testoutput2 | 15 ++++++-- testdata/testoutput6 | 3 +- 7 files changed, 126 insertions(+), 50 deletions(-) diff --git a/ChangeLog b/ChangeLog index f7af977..b7d4005 100644 --- a/ChangeLog +++ b/ChangeLog @@ -8,6 +8,13 @@ Version 10.22 29-January-2016 1. Applied Jason Hood's patches to RunTest.bat and testdata/wintestoutput3 to fix problems with running the tests under Windows. +2. Implemented a facility for quoting literal characters within hexadecimal +patterns in pcre2test, to make it easier to create patterns with just a few +non-printing characters. + +3. Binary zeros are not supported in pcre2test input files. It now detects them +and gives an error. + Version 10.21 12-January-2016 ----------------------------- diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index b8eef93..ce1bc08 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "12 December 2015" "PCRE 10.21" +.TH PCRE2TEST 1 "29 January 2016" "PCRE 10.22" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -68,10 +68,11 @@ environments character 26 (hex 1A) causes an immediate end of file, and no further data is read. .P For maximum portability, therefore, it is safest to avoid non-printing -characters in \fBpcre2test\fP input files. There is a facility for specifying a -pattern's characters as hexadecimal pairs, thus making it possible to include -binary zeroes in a pattern for testing purposes. 
Subject lines are processed -for backslash escapes, which makes it possible to include any data value. +characters in \fBpcre2test\fP input files. There is a facility for specifying +some or all of a pattern's characters as hexadecimal pairs, thus making it +possible to include binary zeroes in a pattern for testing purposes. Subject +lines are processed for backslash escapes, which makes it possible to include +any data value. . . .SH "COMMAND LINE OPTIONS" @@ -523,7 +524,7 @@ about the pattern: debug same as info,fullbincode fullbincode show binary code with lengths /I info show info about compiled pattern - hex pattern is coded in hexadecimal + hex unquoted characters are hexadecimal jit[=] use JIT jitfast use JIT fast path jitverify verify JIT use @@ -614,20 +615,30 @@ testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses default values). . . -.SS "Specifying a pattern in hex" +.SS "Specifying pattern characters in hexadecimal" .rs .sp -The \fBhex\fP modifier specifies that the characters of the pattern are to be -interpreted as pairs of hexadecimal digits. White space is permitted between -pairs. For example: +The \fBhex\fP modifier specifies that the characters of the pattern, except for +substrings enclosed in single or double quotes, are to be interpreted as pairs +of hexadecimal digits. This feature is provided as a way of creating patterns +that contain binary zeros and other non-printing characters. White space is +permitted between pairs of digits. For example, this pattern contains three +characters: .sp /ab 32 59/hex .sp -This feature is provided as a way of creating patterns that contain binary zero -and other non-printing characters. By default, \fBpcre2test\fP passes patterns -as zero-terminated strings to \fBpcre2_compile()\fP, giving the length as -PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the -actual length of the pattern is passed. +Parts of such a pattern are taken literally if quoted. 
This pattern contains +nine characters, only two of which are specified in hexadecimal: +.sp + /ab "literal" 32/hex +.sp +Either single or double quotes may be used. There is no way of including +the delimiter within a substring. +.P +By default, \fBpcre2test\fP passes patterns as zero-terminated strings to +\fBpcre2_compile()\fP, giving the length as PCRE2_ZERO_TERMINATED. However, for +patterns specified with the \fBhex\fP modifier, the actual length of the +pattern is passed. . . .SS "Generating long repetitive patterns" @@ -1640,6 +1651,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 12 December 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 29 January 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/src/pcre2test.c b/src/pcre2test.c index 0a5879e..51f1298 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -2913,10 +2913,11 @@ pbuffer8 = new_pbuffer8; /* Input lines are read into buffer, but both patterns and data lines can be continued over multiple input lines. In addition, if the buffer fills up, we want to automatically expand it so as to be able to handle extremely large -lines that are needed for certain stress tests. When the input buffer is -expanded, the other two buffers must also be expanded likewise, and the -contents of pbuffer, which are a copy of the input for callouts, must be -preserved (for when expansion happens for a data line). This is not the most +lines that are needed for certain stress tests, although this is less likely +now that there are repetition features for both patterns and data. When the +input buffer is expanded, the other two buffers must also be expanded likewise, +and the contents of pbuffer, which are a copy of the input for callouts, must +be preserved (for when expansion happens for a data line). This is not the most optimal way of handling this, but hey, this is just a test program! 
Arguments: @@ -2940,7 +2941,7 @@ for (;;) if (rlen > 1000) { - int dlen; + size_t dlen; /* If libreadline or libedit support is required, use readline() to read a line if the input is a terminal. Note that readline() removes the trailing @@ -2971,9 +2972,23 @@ for (;;) return (here == start)? NULL : start; } - dlen = (int)strlen((char *)here); - if (dlen > 0 && here[dlen - 1] == '\n') return start; + dlen = strlen((char *)here); + if (dlen > 0 && here[dlen - 1] == '\n') return start; /* End of line reached */ here += dlen; + + /* If we have not read a newline when reading a file, we have either filled + the buffer or reached the end of the file. We can detect the former by + checking that the string fills the buffer, and the latter by feof(). If + neither of these is true, it means we read a binary zero which has caused + strlen() to give a short length. This is a hard error because pcre2test + expects to work with C strings. */ + + if (!INTERACTIVE(f) && dlen < rlen - 1 && !feof(f)) + { + fprintf(outfile, "** Binary zero encountered in input\n"); + fprintf(outfile, "** pcre2test run abandoned\n"); + exit(1); + } } else @@ -4451,9 +4466,9 @@ if (pat_patctl.jit == 0 && pat_patctl.jit = 7; /* Now copy the pattern to pbuffer8 for use in 8-bit testing and for reflecting -in callouts. Convert from hex if required; this must necessarily be fewer -characters so will always fit in pbuffer8. Alternatively, process for -repetition if requested. */ +in callouts. Convert from hex if requested (literal strings in quotes may be +present within the hexadecimal pairs). The result must necessarily be fewer +characters so will always fit in pbuffer8. 
*/ if ((pat_patctl.control & CTL_HEXPAT) != 0) { @@ -4464,25 +4479,59 @@ if ((pat_patctl.control & CTL_HEXPAT) != 0) for (pp = buffer + 1; *pp != 0; pp++) { if (isspace(*pp)) continue; - c = toupper(*pp++); - if (*pp == 0) + c = *pp++; + + /* Handle a literal substring */ + + if (c == '\'' || c == '"') { - fprintf(outfile, "** Odd number of digits in hex pattern.\n"); - return PR_SKIP; + for (;; pp++) + { + d = *pp; + if (d == 0) + { + fprintf(outfile, "** Missing closing quote in hex pattern\n"); + return PR_SKIP; + } + if (d == c) break; + *pt++ = d; + } } - d = toupper(*pp); - if (!isxdigit(c) || !isxdigit(d)) + + /* Expect a hex pair */ + + else { - fprintf(outfile, "** Non-hex-digit in hex pattern.\n"); - return PR_SKIP; + if (!isxdigit(c)) + { + fprintf(outfile, "** Unexpected non-hex-digit '%c' in hex pattern: " + "quote missing?\n", c); + return PR_SKIP; + } + if (*pp == 0) + { + fprintf(outfile, "** Odd number of digits in hex pattern\n"); + return PR_SKIP; + } + d = *pp; + if (!isxdigit(d)) + { + fprintf(outfile, "** Unexpected non-hex-digit '%c' in hex pattern: " + "quote missing?\n", d); + return PR_SKIP; + } + c = toupper(c); + d = toupper(d); + *pt++ = ((isdigit(c)? (c - '0') : (c - 'A' + 10)) << 4) + + (isdigit(d)? (d - '0') : (d - 'A' + 10)); } - *pt++ = ((isdigit(c)? (c - '0') : (c - 'A' + 10)) << 4) + - (isdigit(d)? (d - '0') : (d - 'A' + 10)); } *pt = 0; patlen = pt - pbuffer8; } +/* If not a hex string, process for repetition expansion if requested. 
*/ + else if ((pat_patctl.control & CTL_EXPAND) != 0) { uint8_t *pp, *pt; @@ -4567,7 +4616,7 @@ if (pat_patctl.locale[0] != 0) { if (pat_patctl.tables_id != 0) { - fprintf(outfile, "** 'Locale' and 'tables' must not both be set.\n"); + fprintf(outfile, "** 'Locale' and 'tables' must not both be set\n"); return PR_SKIP; } if (setlocale(LC_CTYPE, (const char *)pat_patctl.locale) == NULL) diff --git a/testdata/testinput2 b/testdata/testinput2 index 071cca1..40f22c4 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4792,12 +4792,16 @@ a)"xI /(*MARK:A\x00b)/mark,alt_verbnames abc -# /(*MARK:A\x00b)/ -/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex +/"(*MARK:A" 00 "b)"/mark,hex abc -# /(*MARK:A\x00b)/ -/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex,alt_verbnames +/"(*MARK:A" 00 "b)"/mark,hex,alt_verbnames abc +/efg/hex + +/eff/hex + +/effg/hex + # End of testinput2 diff --git a/testdata/testinput6 b/testdata/testinput6 index c24b40f..a19bff3 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -4847,8 +4847,7 @@ aaaXY # Binary zero in callout string -# a ( ? C ' x z ' ) b -/ 61 28 3f 43 27 78 00 7a 27 29 62/hex +/"a(?C'x" 00 "z')b"/hex abcdefgh /(?(?!)a|b)/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 7178410..13b7f7c 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -15146,16 +15146,23 @@ MK: A\x00b 0: MK: A\x00b -# /(*MARK:A\x00b)/ -/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex +/"(*MARK:A" 00 "b)"/mark,hex abc 0: MK: A\x00b -# /(*MARK:A\x00b)/ -/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex,alt_verbnames +/"(*MARK:A" 00 "b)"/mark,hex,alt_verbnames abc 0: MK: A\x00b +/efg/hex +** Unexpected non-hex-digit 'g' in hex pattern: quote missing? + +/eff/hex +** Odd number of digits in hex pattern + +/effg/hex +** Unexpected non-hex-digit 'g' in hex pattern: quote missing? 
+ # End of testinput2 diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 9ae092b..e4074cd 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7619,8 +7619,7 @@ Callout (8): `code` 0: aaaX # Binary zero in callout string -# a ( ? C ' x z ' ) b -/ 61 28 3f 43 27 78 00 7a 27 29 62/hex +/"a(?C'x" 00 "z')b"/hex abcdefgh Callout (5): 'x\x00z' --->abcdefgh