From aec5c96cf5ba2c48b6766f4c1b6dca02f8e70a6f Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 27 Nov 2015 15:58:44 +0000 Subject: [PATCH] Fix bad offset value in invalid UTF pattern error. --- ChangeLog | 5 ++++- src/pcre2_compile.c | 7 ++++--- src/pcre2_error.c | 2 +- testdata/testinput10 | 4 +++- testdata/testoutput10 | 11 +++++++---- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index afd8834..1e60a78 100644 --- a/ChangeLog +++ b/ChangeLog @@ -332,7 +332,10 @@ because it sets the "startoffset" parameter for pcre2_match(). 99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between an item and its qualifier (for example, A(?#comment)?B) pcre2_compile() -misbehaved. +misbehaved. This bug was found by the LLVM fuzzer. + +100. The error for an invalid UTF pattern string always gave the code unit +offset as zero instead of where the invalidity was found. Version 10.20 30-June-2015 diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 67061f8..453e206 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -8468,7 +8468,7 @@ if (utf) } if ((options & PCRE2_NO_UTF_CHECK) == 0 && (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) - goto HAD_ERROR; + goto HAD_UTF_ERROR; } /* Check UCP lockout. */ @@ -8849,10 +8849,11 @@ via the dreaded goto. */ if (errorcode != 0) { HAD_ERROR: + *erroroffset = (int)(ptr - pattern); + HAD_UTF_ERROR: + *errorptr = errorcode; pcre2_code_free(re); re = NULL; - *errorptr = errorcode; - *erroroffset = (int)(ptr - pattern); goto EXIT; } diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 0aa108e..a90662b 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -204,7 +204,7 @@ static const char match_error_texts[] = /* 20 */ "UTF-8 error: overlong 5-byte sequence\0" "UTF-8 error: overlong 6-byte sequence\0" - "UTF-8 error: isolated 0x80 byte\0" + "UTF-8 error: isolated byte with 0x80 bit set\0" "UTF-8 error: illegal byte (0xfe or 0xff)\0" "UTF-16 error: missing low surrogate at end\0" /* 25 */ diff --git a/testdata/testinput10 b/testdata/testinput10 index 681ba9a..550e1c9 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -1,7 +1,7 @@ # This set of tests is for UTF-8 support and Unicode property support, with # relevance only for the 8-bit library. -# The next 3 patterns have UTF-8 errors +# The next 4 patterns have UTF-8 errors /[Ã]/utf @@ -9,6 +9,8 @@ /ÃÃÃxxx/utf +/‚‚‚‚‚‚‚Ã/utf + # Now test subjects /badutf/utf diff --git a/testdata/testoutput10 b/testdata/testoutput10 index a93070f..9761f0f 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1,10 +1,10 @@ # This set of tests is for UTF-8 support and Unicode property support, with # relevance only for the 8-bit library. -# The next 3 patterns have UTF-8 errors +# The next 4 patterns have UTF-8 errors /[Ã]/utf -Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80 +Failed: error -8 at offset 1: UTF-8 error: byte 2 top bits not 0x80 /Ã/utf Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end @@ -12,6 +12,9 @@ Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end /ÃÃÃxxx/utf Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80 +/‚‚‚‚‚‚‚Ã/utf +Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set + # Now test subjects /badutf/utf @@ -89,7 +92,7 @@ Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0 \xfc\x80\x80\x80\x80\x8f Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0 \x80 -Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 0 +Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0 \xfe Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0 \xff @@ -1534,6 +1537,6 @@ Options: utf First code unit = 'x' Subject length lower bound = 1 a\x80zx\=offset=3 -Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 1 +Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1 # End of testinput10