From 4f31de28663a4797c9b0567b43f037d892474884 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 28 May 2019 14:14:22 +0000 Subject: [PATCH] Add support for invalid UTF-8 matching to pcre2grep. --- ChangeLog | 4 ++++ RunGrepTest | 12 +++++++++++- doc/pcre2grep.1 | 24 +++++++++++++++++------- src/pcre2grep.c | 26 +++++++++++++++++++++++++- testdata/grepoutput8 | 11 +++++++++++ 5 files changed, 68 insertions(+), 9 deletions(-) diff --git a/ChangeLog b/ChangeLog index d638929..84e051a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -18,6 +18,10 @@ detects invalid characters in the 0xd800-0xdfff range. interpreter, and integrate with the existing JIT support via the new PCRE2_MATCH_INVALID_UTF compile-time option. +5. Give more error detail for invalid UTF-8 when detected in pcre2grep. + +6. Add support for invalid UTF-8 to pcre2grep. + Version 10.33 16-April-2019 --------------------------- diff --git a/RunGrepTest b/RunGrepTest index bac1f1b..9990b7e 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -8,7 +8,7 @@ # * Put printf arguments in single, not double quotes to avoid unwanted # escaping. # * Use \0 for binary zero in printf, not \x0, for the benefit of older -# versions. +# versions (and use octal for other special values). # Set the C locale, so that sort(1) behaves predictably. @@ -676,6 +676,16 @@ if [ $utf8 -ne 0 ] ; then echo "---------------------------- Test U3 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep echo "RC=$?" >>testtrygrep + + echo "---------------------------- Test U4 ------------------------------" >>testtrygrep + printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep + (cd $srcdir; $valgrind $vjs $pcre2grep -u -o '....' testtemp1grep) >>testtrygrep 2>&1 + echo "RC=$?" 
>>testtrygrep + + echo "---------------------------- Test U5 ------------------------------" >>testtrygrep + printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep + (cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' testtemp1grep) >>testtrygrep + echo "RC=$?" >>testtrygrep $cf $srcdir/testdata/grepoutput8 testtrygrep if [ $? != 0 ] ; then exit 1; fi diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1 index 6b3219b..b5dad9f 100644 --- a/doc/pcre2grep.1 +++ b/doc/pcre2grep.1 @@ -1,4 +1,4 @@ -.TH PCRE2GREP 1 "24 November 2018" "PCRE2 10.33" +.TH PCRE2GREP 1 "28 May 2019" "PCRE2 10.34" .SH NAME pcre2grep - a grep with Perl-compatible regular expressions. .SH SYNOPSIS @@ -644,12 +644,22 @@ is listed. If file names are being output, the grand total is preceded by ignored when used with \fB-L\fP (list files without matches), because the grand total would always be zero. .TP -\fB-u\fP, \fB--utf-8\fP +\fB-u\fP, \fB--utf\fP Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including those for any \fB--exclude\fP and \fB--include\fP options) and all subject lines that are scanned must be valid strings of UTF-8 characters. .TP +\fB-U\fP, \fB--utf-allow-invalid\fP +As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code +unit sequences. These can never form part of any pattern match. This facility +allows valid UTF-8 strings to be sought in executable or other binary files. +For more details about matching in non-valid UTF-8 strings, see the +.\" HREF +\fBpcre2unicode\fP(3) +.\" +documentation. +.TP \fB-V\fP, \fB--version\fP Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the standard output and then exit. Anything else on the command line is @@ -711,9 +721,9 @@ as in the GNU \fBgrep\fP program.
Any long option of the form \fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP, \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP, -\fB--output\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to -\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a -capturing parentheses number. +\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP +options are specific to \fBpcre2grep\fP, as is the use of the +\fB--only-matching\fP option with a capturing parentheses number. .P Although most of the common options work the same way, a few are different in \fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob @@ -884,6 +894,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 24 November 2018 -Copyright (c) 1997-2018 University of Cambridge. +Last updated: 28 May 2019 +Copyright (c) 1997-2019 University of Cambridge. .fi diff --git a/src/pcre2grep.c b/src/pcre2grep.c index a3cc3ec..02464a3 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -13,7 +13,7 @@ distribution because other apparatus is needed to compile pcre2grep for z/OS. The header can be found in the special z/OS distribution, which is available from www.zaconsultants.net or from www.cbttape.org. - Copyright (c) 1997-2018 University of Cambridge + Copyright (c) 1997-2019 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -110,6 +110,19 @@ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ #define snprintf _snprintf #endif +/* VC and older compilers don't support %td or %zu, and even some that claim to +be C99 don't support it (hence DISABLE_PERCENT_ZT). 
*/ + +#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(DISABLE_PERCENT_ZT) +#define PTR_FORM "lu" +#define SIZ_FORM "lu" +#define SIZ_CAST (unsigned long int) +#else +#define PTR_FORM "td" +#define SIZ_FORM "zu" +#define SIZ_CAST +#endif + #define FALSE 0 #define TRUE 1 @@ -451,6 +464,7 @@ static option_item optionlist[] = { { OP_NODATA, 's', NULL, "no-messages", "suppress error messages" }, { OP_NODATA, 't', NULL, "total-count", "print total count of matching lines" }, { OP_NODATA, 'u', NULL, "utf", "use UTF mode" }, + { OP_NODATA, 'U', NULL, "utf-allow-invalid", "use UTF mode, allow for invalid code units" }, { OP_NODATA, 'V', NULL, "version", "print version information and exit" }, { OP_NODATA, 'v', NULL, "invert-match", "select non-matching lines" }, { OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" }, @@ -1733,6 +1747,15 @@ for (i = 1; p != NULL; p = p->next, i++) fprintf(stderr, "%s", msg); FWRITE_IGNORE(matchptr, 1, slen, stderr); /* In case binary zero included */ fprintf(stderr, "\n\n"); + if (*mrc <= PCRE2_ERROR_UTF8_ERR1 && + *mrc >= PCRE2_ERROR_UTF8_ERR21) + { + unsigned char mbuffer[256]; + PCRE2_SIZE startchar = pcre2_get_startchar(match_data); + (void)pcre2_get_error_message(*mrc, mbuffer, sizeof(mbuffer)); + fprintf(stderr, "%s at offset %" SIZ_FORM "\n\n", mbuffer, + SIZ_CAST startchar); + } if (*mrc == PCRE2_ERROR_MATCHLIMIT || *mrc == PCRE2_ERROR_DEPTHLIMIT || *mrc == PCRE2_ERROR_HEAPLIMIT || *mrc == PCRE2_ERROR_JIT_STACKLIMIT) resource_error = TRUE; @@ -3401,6 +3424,7 @@ switch(letter) case 's': silent = TRUE; break; case 't': show_total_count = TRUE; break; case 'u': options |= PCRE2_UTF; utf = TRUE; break; + case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break; case 'v': invert = TRUE; break; case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break; case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break; diff --git a/testdata/grepoutput8 
b/testdata/grepoutput8 index aaed6ae..1bac20b 100644 --- a/testdata/grepoutput8 +++ b/testdata/grepoutput8 @@ -18,3 +18,14 @@ RC=0 22:6,2 22:8,2 RC=0 +---------------------------- Test U4 ------------------------------ +pcre2grep: pcre2_match() gave error -22 while matching this text: + +Aက€CD Z + +UTF-8 error: isolated byte with 0x80 bit set at offset 4 + +RC=1 +---------------------------- Test U5 ------------------------------ +CD Z +RC=0