Two pcre2test extensions: quoted literal substrings in hex patterns and
detection of unsupported binary zeros in file input.
This commit is contained in:
parent
fd008957d5
commit
8febd27344
|
@ -8,6 +8,13 @@ Version 10.22 29-January-2016
|
|||
1. Applied Jason Hood's patches to RunTest.bat and testdata/wintestoutput3
|
||||
to fix problems with running the tests under Windows.
|
||||
|
||||
2. Implemented a facility for quoting literal characters within hexadecimal
|
||||
patterns in pcre2test, to make it easier to create patterns with just a few
|
||||
non-printing characters.
|
||||
|
||||
3. Binary zeros are not supported in pcre2test input files. The program now detects them
|
||||
and gives an error.
|
||||
|
||||
|
||||
Version 10.21 12-January-2016
|
||||
-----------------------------
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "12 December 2015" "PCRE 10.21"
|
||||
.TH PCRE2TEST 1 "29 January 2016" "PCRE 10.22"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -68,10 +68,11 @@ environments character 26 (hex 1A) causes an immediate end of file, and no
|
|||
further data is read.
|
||||
.P
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in \fBpcre2test\fP input files. There is a facility for specifying a
|
||||
pattern's characters as hexadecimal pairs, thus making it possible to include
|
||||
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
||||
for backslash escapes, which makes it possible to include any data value.
|
||||
characters in \fBpcre2test\fP input files. There is a facility for specifying
|
||||
some or all of a pattern's characters as hexadecimal pairs, thus making it
|
||||
possible to include binary zeros in a pattern for testing purposes. Subject
|
||||
lines are processed for backslash escapes, which makes it possible to include
|
||||
any data value.
|
||||
.
|
||||
.
|
||||
.SH "COMMAND LINE OPTIONS"
|
||||
|
@ -523,7 +524,7 @@ about the pattern:
|
|||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
hex unquoted characters are hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
|
@ -614,20 +615,30 @@ testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses
|
|||
default values).
|
||||
.
|
||||
.
|
||||
.SS "Specifying a pattern in hex"
|
||||
.SS "Specifying pattern characters in hexadecimal"
|
||||
.rs
|
||||
.sp
|
||||
The \fBhex\fP modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted between
|
||||
pairs. For example:
|
||||
The \fBhex\fP modifier specifies that the characters of the pattern, except for
|
||||
substrings enclosed in single or double quotes, are to be interpreted as pairs
|
||||
of hexadecimal digits. This feature is provided as a way of creating patterns
|
||||
that contain binary zeros and other non-printing characters. White space is
|
||||
permitted between pairs of digits. For example, this pattern contains three
|
||||
characters:
|
||||
.sp
|
||||
/ab 32 59/hex
|
||||
.sp
|
||||
This feature is provided as a way of creating patterns that contain binary zero
|
||||
and other non-printing characters. By default, \fBpcre2test\fP passes patterns
|
||||
as zero-terminated strings to \fBpcre2_compile()\fP, giving the length as
|
||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
||||
actual length of the pattern is passed.
|
||||
Parts of such a pattern are taken literally if quoted. This pattern contains
|
||||
nine characters, only two of which are specified in hexadecimal:
|
||||
.sp
|
||||
/ab "literal" 32/hex
|
||||
.sp
|
||||
Either single or double quotes may be used. There is no way of including
|
||||
the delimiting quote character within the quoted substring that it encloses.
|
||||
.P
|
||||
By default, \fBpcre2test\fP passes patterns as zero-terminated strings to
|
||||
\fBpcre2_compile()\fP, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
||||
patterns specified with the \fBhex\fP modifier, the actual length of the
|
||||
pattern is passed.
|
||||
.
|
||||
.
|
||||
.SS "Generating long repetitive patterns"
|
||||
|
@ -1640,6 +1651,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 12 December 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
Last updated: 29 January 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -2913,10 +2913,11 @@ pbuffer8 = new_pbuffer8;
|
|||
/* Input lines are read into buffer, but both patterns and data lines can be
|
||||
continued over multiple input lines. In addition, if the buffer fills up, we
|
||||
want to automatically expand it so as to be able to handle extremely large
|
||||
lines that are needed for certain stress tests. When the input buffer is
|
||||
expanded, the other two buffers must also be expanded likewise, and the
|
||||
contents of pbuffer, which are a copy of the input for callouts, must be
|
||||
preserved (for when expansion happens for a data line). This is not the most
|
||||
lines that are needed for certain stress tests, although this is less likely
|
||||
now that there are repetition features for both patterns and data. When the
|
||||
input buffer is expanded, the other two buffers must also be expanded likewise,
|
||||
and the contents of pbuffer, which are a copy of the input for callouts, must
|
||||
be preserved (for when expansion happens for a data line). This is not the most
|
||||
optimal way of handling this, but hey, this is just a test program!
|
||||
|
||||
Arguments:
|
||||
|
@ -2940,7 +2941,7 @@ for (;;)
|
|||
|
||||
if (rlen > 1000)
|
||||
{
|
||||
int dlen;
|
||||
size_t dlen;
|
||||
|
||||
/* If libreadline or libedit support is required, use readline() to read a
|
||||
line if the input is a terminal. Note that readline() removes the trailing
|
||||
|
@ -2971,9 +2972,23 @@ for (;;)
|
|||
return (here == start)? NULL : start;
|
||||
}
|
||||
|
||||
dlen = (int)strlen((char *)here);
|
||||
if (dlen > 0 && here[dlen - 1] == '\n') return start;
|
||||
dlen = strlen((char *)here);
|
||||
if (here[dlen - 1] == '\n') return start; /* End of line reached */
|
||||
here += dlen;
|
||||
|
||||
/* If we have not read a newline when reading a file, we have either filled
|
||||
the buffer or reached the end of the file. We can detect the former by
|
||||
checking that the string fills the buffer, and the latter by feof(). If
|
||||
neither of these is true, it means we read a binary zero which has caused
|
||||
strlen() to give a short length. This is a hard error because pcre2test
|
||||
expects to work with C strings. */
|
||||
|
||||
if (!INTERACTIVE(f) && dlen < rlen - 1 && !feof(f))
|
||||
{
|
||||
fprintf(outfile, "** Binary zero encountered in input\n");
|
||||
fprintf(outfile, "** pcre2test run abandoned\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
|
@ -4451,9 +4466,9 @@ if (pat_patctl.jit == 0 &&
|
|||
pat_patctl.jit = 7;
|
||||
|
||||
/* Now copy the pattern to pbuffer8 for use in 8-bit testing and for reflecting
|
||||
in callouts. Convert from hex if required; this must necessarily be fewer
|
||||
characters so will always fit in pbuffer8. Alternatively, process for
|
||||
repetition if requested. */
|
||||
in callouts. Convert from hex if requested (literal strings in quotes may be
|
||||
present within the hexadecimal pairs). The result must necessarily be fewer
|
||||
characters so will always fit in pbuffer8. */
|
||||
|
||||
if ((pat_patctl.control & CTL_HEXPAT) != 0)
|
||||
{
|
||||
|
@ -4464,25 +4479,59 @@ if ((pat_patctl.control & CTL_HEXPAT) != 0)
|
|||
for (pp = buffer + 1; *pp != 0; pp++)
|
||||
{
|
||||
if (isspace(*pp)) continue;
|
||||
c = toupper(*pp++);
|
||||
c = *pp++;
|
||||
|
||||
/* Handle a literal substring */
|
||||
|
||||
if (c == '\'' || c == '"')
|
||||
{
|
||||
for (;; pp++)
|
||||
{
|
||||
d = *pp;
|
||||
if (d == 0)
|
||||
{
|
||||
fprintf(outfile, "** Missing closing quote in hex pattern\n");
|
||||
return PR_SKIP;
|
||||
}
|
||||
if (d == c) break;
|
||||
*pt++ = d;
|
||||
}
|
||||
}
|
||||
|
||||
/* Expect a hex pair */
|
||||
|
||||
else
|
||||
{
|
||||
if (!isxdigit(c))
|
||||
{
|
||||
fprintf(outfile, "** Unexpected non-hex-digit '%c' in hex pattern: "
|
||||
"quote missing?\n", c);
|
||||
return PR_SKIP;
|
||||
}
|
||||
if (*pp == 0)
|
||||
{
|
||||
fprintf(outfile, "** Odd number of digits in hex pattern.\n");
|
||||
fprintf(outfile, "** Odd number of digits in hex pattern\n");
|
||||
return PR_SKIP;
|
||||
}
|
||||
d = toupper(*pp);
|
||||
if (!isxdigit(c) || !isxdigit(d))
|
||||
d = *pp;
|
||||
if (!isxdigit(d))
|
||||
{
|
||||
fprintf(outfile, "** Non-hex-digit in hex pattern.\n");
|
||||
fprintf(outfile, "** Unexpected non-hex-digit '%c' in hex pattern: "
|
||||
"quote missing?\n", d);
|
||||
return PR_SKIP;
|
||||
}
|
||||
c = toupper(c);
|
||||
d = toupper(d);
|
||||
*pt++ = ((isdigit(c)? (c - '0') : (c - 'A' + 10)) << 4) +
|
||||
(isdigit(d)? (d - '0') : (d - 'A' + 10));
|
||||
}
|
||||
}
|
||||
*pt = 0;
|
||||
patlen = pt - pbuffer8;
|
||||
}
|
||||
|
||||
/* If not a hex string, process for repetition expansion if requested. */
|
||||
|
||||
else if ((pat_patctl.control & CTL_EXPAND) != 0)
|
||||
{
|
||||
uint8_t *pp, *pt;
|
||||
|
@ -4567,7 +4616,7 @@ if (pat_patctl.locale[0] != 0)
|
|||
{
|
||||
if (pat_patctl.tables_id != 0)
|
||||
{
|
||||
fprintf(outfile, "** 'Locale' and 'tables' must not both be set.\n");
|
||||
fprintf(outfile, "** 'Locale' and 'tables' must not both be set\n");
|
||||
return PR_SKIP;
|
||||
}
|
||||
if (setlocale(LC_CTYPE, (const char *)pat_patctl.locale) == NULL)
|
||||
|
|
|
@ -4792,12 +4792,16 @@ a)"xI
|
|||
/(*MARK:A\x00b)/mark,alt_verbnames
|
||||
abc
|
||||
|
||||
# /(*MARK:A\x00b)/
|
||||
/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex
|
||||
/"(*MARK:A" 00 "b)"/mark,hex
|
||||
abc
|
||||
|
||||
# /(*MARK:A\x00b)/
|
||||
/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex,alt_verbnames
|
||||
/"(*MARK:A" 00 "b)"/mark,hex,alt_verbnames
|
||||
abc
|
||||
|
||||
/efg/hex
|
||||
|
||||
/eff/hex
|
||||
|
||||
/effg/hex
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -4847,8 +4847,7 @@
|
|||
aaaXY
|
||||
|
||||
# Binary zero in callout string
|
||||
# a ( ? C ' x z ' ) b
|
||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex
|
||||
/"a(?C'x" 00 "z')b"/hex
|
||||
abcdefgh
|
||||
|
||||
/(?(?!)a|b)/
|
||||
|
|
|
@ -15146,16 +15146,23 @@ MK: A\x00b
|
|||
0:
|
||||
MK: A\x00b
|
||||
|
||||
# /(*MARK:A\x00b)/
|
||||
/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex
|
||||
/"(*MARK:A" 00 "b)"/mark,hex
|
||||
abc
|
||||
0:
|
||||
MK: A\x00b
|
||||
|
||||
# /(*MARK:A\x00b)/
|
||||
/28 2a 4d 41 52 4b 3a 41 00 62 29/mark,hex,alt_verbnames
|
||||
/"(*MARK:A" 00 "b)"/mark,hex,alt_verbnames
|
||||
abc
|
||||
0:
|
||||
MK: A\x00b
|
||||
|
||||
/efg/hex
|
||||
** Unexpected non-hex-digit 'g' in hex pattern: quote missing?
|
||||
|
||||
/eff/hex
|
||||
** Odd number of digits in hex pattern
|
||||
|
||||
/effg/hex
|
||||
** Unexpected non-hex-digit 'g' in hex pattern: quote missing?
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -7619,8 +7619,7 @@ Callout (8): `code`
|
|||
0: aaaX
|
||||
|
||||
# Binary zero in callout string
|
||||
# a ( ? C ' x z ' ) b
|
||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex
|
||||
/"a(?C'x" 00 "z')b"/hex
|
||||
abcdefgh
|
||||
Callout (5): 'x\x00z'
|
||||
--->abcdefgh
|
||||
|
|
Loading…
Reference in New Issue