Add support for invalid UTF-8 matching to pcre2grep.
This commit is contained in:
parent
5850cc5928
commit
4f31de2866
|
@ -18,6 +18,10 @@ detects invalid characters in the 0xd800-0xdfff range.
|
||||||
interpreter, and integrate with the existing JIT support via the new
|
interpreter, and integrate with the existing JIT support via the new
|
||||||
PCRE2_MATCH_INVALID_UTF compile-time option.
|
PCRE2_MATCH_INVALID_UTF compile-time option.
|
||||||
|
|
||||||
|
5. Give more error detail for invalid UTF-8 when detected in pcre2grep.
|
||||||
|
|
||||||
|
6. Add support for invalid UTF-8 to pcre2grep (-U, --utf-allow-invalid).
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
12
RunGrepTest
12
RunGrepTest
|
@ -8,7 +8,7 @@
|
||||||
# * Put printf arguments in single, not double quotes to avoid unwanted
|
# * Put printf arguments in single, not double quotes to avoid unwanted
|
||||||
# escaping.
|
# escaping.
|
||||||
# * Use \0 for binary zero in printf, not \x0, for the benefit of older
|
# * Use \0 for binary zero in printf, not \x0, for the benefit of older
|
||||||
# versions.
|
# versions (and use octal for other special values).
|
||||||
|
|
||||||
# Set the C locale, so that sort(1) behaves predictably.
|
# Set the C locale, so that sort(1) behaves predictably.
|
||||||
|
|
||||||
|
@ -677,6 +677,16 @@ if [ $utf8 -ne 0 ] ; then
|
||||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||||
echo "RC=$?" >>testtrygrep
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
|
||||||
|
printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -u -o '....' testtemp1grep) >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test U5 ------------------------------" >>testtrygrep
|
||||||
|
printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' testtemp1grep) >>testtrygrep
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
||||||
if [ $? != 0 ] ; then exit 1; fi
|
if [ $? != 0 ] ; then exit 1; fi
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2GREP 1 "24 November 2018" "PCRE2 10.33"
|
.TH PCRE2GREP 1 "28 May 2019" "PCRE2 10.34"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -644,12 +644,22 @@ is listed. If file names are being output, the grand total is preceded by
|
||||||
ignored when used with \fB-L\fP (list files without matches), because the grand
|
ignored when used with \fB-L\fP (list files without matches), because the grand
|
||||||
total would always be zero.
|
total would always be zero.
|
||||||
.TP
|
.TP
|
||||||
\fB-u\fP, \fB--utf-8\fP
|
\fB-u\fP, \fB--utf\fP
|
||||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||||
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||||
\fB--include\fP options) and all subject lines that are scanned must be valid
|
\fB--include\fP options) and all subject lines that are scanned must be valid
|
||||||
strings of UTF-8 characters.
|
strings of UTF-8 characters.
|
||||||
.TP
|
.TP
|
||||||
|
\fB-U\fP, \fB--utf-allow-invalid\fP
|
||||||
|
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
||||||
|
unit sequences. These can never form part of any pattern match. This facility
|
||||||
|
allows valid UTF-8 strings to be sought in executable or other binary files.
|
||||||
|
For more details about matching in non-valid UTF-8 strings, see the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2unicode\fP(3)
|
||||||
|
.\"
|
||||||
|
documentation.
|
||||||
|
.TP
|
||||||
\fB-V\fP, \fB--version\fP
|
\fB-V\fP, \fB--version\fP
|
||||||
Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
|
Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
|
||||||
standard output and then exit. Anything else on the command line is
|
standard output and then exit. Anything else on the command line is
|
||||||
|
@ -711,9 +721,9 @@ as in the GNU \fBgrep\fP program. Any long option of the form
|
||||||
\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
|
\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
|
||||||
\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
|
\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
|
||||||
\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
|
\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
|
||||||
\fB--output\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to
|
\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
|
||||||
\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
|
options are specific to \fBpcre2grep\fP, as is the use of the
|
||||||
capturing parentheses number.
|
\fB--only-matching\fP option with a capturing parentheses number.
|
||||||
.P
|
.P
|
||||||
Although most of the common options work the same way, a few are different in
|
Although most of the common options work the same way, a few are different in
|
||||||
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
|
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
|
||||||
|
@ -884,6 +894,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 24 November 2018
|
Last updated: 28 May 2019
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2019 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -13,7 +13,7 @@ distribution because other apparatus is needed to compile pcre2grep for z/OS.
|
||||||
The header can be found in the special z/OS distribution, which is available
|
The header can be found in the special z/OS distribution, which is available
|
||||||
from www.zaconsultants.net or from www.cbttape.org.
|
from www.zaconsultants.net or from www.cbttape.org.
|
||||||
|
|
||||||
Copyright (c) 1997-2018 University of Cambridge
|
Copyright (c) 1997-2019 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -110,6 +110,19 @@ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
|
||||||
#define snprintf _snprintf
|
#define snprintf _snprintf
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* VC and older compilers don't support %td or %zu, and even some that claim to
|
||||||
|
be C99 don't support them (hence DISABLE_PERCENT_ZT). */
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(DISABLE_PERCENT_ZT)
|
||||||
|
#define PTR_FORM "lu"
|
||||||
|
#define SIZ_FORM "lu"
|
||||||
|
#define SIZ_CAST (unsigned long int)
|
||||||
|
#else
|
||||||
|
#define PTR_FORM "td"
|
||||||
|
#define SIZ_FORM "zu"
|
||||||
|
#define SIZ_CAST
|
||||||
|
#endif
|
||||||
|
|
||||||
#define FALSE 0
|
#define FALSE 0
|
||||||
#define TRUE 1
|
#define TRUE 1
|
||||||
|
|
||||||
|
@ -451,6 +464,7 @@ static option_item optionlist[] = {
|
||||||
{ OP_NODATA, 's', NULL, "no-messages", "suppress error messages" },
|
{ OP_NODATA, 's', NULL, "no-messages", "suppress error messages" },
|
||||||
{ OP_NODATA, 't', NULL, "total-count", "print total count of matching lines" },
|
{ OP_NODATA, 't', NULL, "total-count", "print total count of matching lines" },
|
||||||
{ OP_NODATA, 'u', NULL, "utf", "use UTF mode" },
|
{ OP_NODATA, 'u', NULL, "utf", "use UTF mode" },
|
||||||
|
{ OP_NODATA, 'U', NULL, "utf-allow-invalid", "use UTF mode, allow for invalid code units" },
|
||||||
{ OP_NODATA, 'V', NULL, "version", "print version information and exit" },
|
{ OP_NODATA, 'V', NULL, "version", "print version information and exit" },
|
||||||
{ OP_NODATA, 'v', NULL, "invert-match", "select non-matching lines" },
|
{ OP_NODATA, 'v', NULL, "invert-match", "select non-matching lines" },
|
||||||
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
|
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
|
||||||
|
@ -1733,6 +1747,15 @@ for (i = 1; p != NULL; p = p->next, i++)
|
||||||
fprintf(stderr, "%s", msg);
|
fprintf(stderr, "%s", msg);
|
||||||
FWRITE_IGNORE(matchptr, 1, slen, stderr); /* In case binary zero included */
|
FWRITE_IGNORE(matchptr, 1, slen, stderr); /* In case binary zero included */
|
||||||
fprintf(stderr, "\n\n");
|
fprintf(stderr, "\n\n");
|
||||||
|
if (*mrc <= PCRE2_ERROR_UTF8_ERR1 &&
|
||||||
|
*mrc >= PCRE2_ERROR_UTF8_ERR21)
|
||||||
|
{
|
||||||
|
unsigned char mbuffer[256];
|
||||||
|
PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
|
||||||
|
(void)pcre2_get_error_message(*mrc, mbuffer, sizeof(mbuffer));
|
||||||
|
fprintf(stderr, "%s at offset %" SIZ_FORM "\n\n", mbuffer,
|
||||||
|
SIZ_CAST startchar);
|
||||||
|
}
|
||||||
if (*mrc == PCRE2_ERROR_MATCHLIMIT || *mrc == PCRE2_ERROR_DEPTHLIMIT ||
|
if (*mrc == PCRE2_ERROR_MATCHLIMIT || *mrc == PCRE2_ERROR_DEPTHLIMIT ||
|
||||||
*mrc == PCRE2_ERROR_HEAPLIMIT || *mrc == PCRE2_ERROR_JIT_STACKLIMIT)
|
*mrc == PCRE2_ERROR_HEAPLIMIT || *mrc == PCRE2_ERROR_JIT_STACKLIMIT)
|
||||||
resource_error = TRUE;
|
resource_error = TRUE;
|
||||||
|
@ -3401,6 +3424,7 @@ switch(letter)
|
||||||
case 's': silent = TRUE; break;
|
case 's': silent = TRUE; break;
|
||||||
case 't': show_total_count = TRUE; break;
|
case 't': show_total_count = TRUE; break;
|
||||||
case 'u': options |= PCRE2_UTF; utf = TRUE; break;
|
case 'u': options |= PCRE2_UTF; utf = TRUE; break;
|
||||||
|
case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
|
||||||
case 'v': invert = TRUE; break;
|
case 'v': invert = TRUE; break;
|
||||||
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
||||||
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
||||||
|
|
|
@ -18,3 +18,14 @@ RC=0
|
||||||
22:6,2
|
22:6,2
|
||||||
22:8,2
|
22:8,2
|
||||||
RC=0
|
RC=0
|
||||||
|
---------------------------- Test U4 ------------------------------
|
||||||
|
pcre2grep: pcre2_match() gave error -22 while matching this text:
|
||||||
|
|
||||||
|
Aက€CD Z
|
||||||
|
|
||||||
|
UTF-8 error: isolated byte with 0x80 bit set at offset 4
|
||||||
|
|
||||||
|
RC=1
|
||||||
|
---------------------------- Test U5 ------------------------------
|
||||||
|
CD Z
|
||||||
|
RC=0
|
||||||
|
|
Loading…
Reference in New Issue