Fix bad offset value in invalid UTF pattern error.

2015-11-27 15:58:44 +00:00 · 2015-11-27 15:58:44 +00:00 · aec5c96cf5
parent 2eb24e2dac
commit aec5c96cf5
5 changed files with 19 additions and 10 deletions
--- a/ChangeLog
+++ b/ChangeLog
@ -332,7 +332,10 @@ because it sets the "startoffset" parameter for pcre2_match().
 99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between 
 an item and its qualifier (for example, A(?#comment)?B) pcre2_compile() 
-misbehaved.
+misbehaved. This bug was found by the LLVM fuzzer.
+100. The error for an invalid UTF pattern string always gave the code unit 
+offset as zero instead of where the invalidity was found.
 Version 10.20 30-June-2015
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -8468,7 +8468,7 @@ if (utf)
    }
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
-    goto HAD_ERROR;
+    goto HAD_UTF_ERROR;
  }
 /* Check UCP lockout. */
@ -8849,10 +8849,11 @@ via the dreaded goto. */
 if (errorcode != 0)
  {
  HAD_ERROR:
+ *erroroffset = (int)(ptr - pattern);
+ HAD_UTF_ERROR:
+ *errorptr = errorcode;
  pcre2_code_free(re);
  re = NULL;
- *errorptr = errorcode;
- *erroroffset = (int)(ptr - pattern);
  goto EXIT;
  }
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@ -204,7 +204,7 @@ static const char match_error_texts[] =
  /* 20 */
  "UTF-8 error: overlong 5-byte sequence\0"
  "UTF-8 error: overlong 6-byte sequence\0"
-  "UTF-8 error: isolated 0x80 byte\0"
+  "UTF-8 error: isolated byte with 0x80 bit set\0"
  "UTF-8 error: illegal byte (0xfe or 0xff)\0"
  "UTF-16 error: missing low surrogate at end\0"
  /* 25 */
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -1,7 +1,7 @@
 # This set of tests is for UTF-8 support and Unicode property support, with
 # relevance only for the 8-bit library.
-# The next 3 patterns have UTF-8 errors
+# The next 4 patterns have UTF-8 errors
 /[Ã]/utf
@ -9,6 +9,8 @@
 /ÃÃÃxxx/utf
+/Ã‚‚‚‚‚‚‚‚Ã/utf
 # Now test subjects
 /badutf/utf
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1,10 +1,10 @@
 # This set of tests is for UTF-8 support and Unicode property support, with
 # relevance only for the 8-bit library.
-# The next 3 patterns have UTF-8 errors
+# The next 4 patterns have UTF-8 errors
 /[Ã]/utf
-Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80
+Failed: error -8 at offset 1: UTF-8 error: byte 2 top bits not 0x80
 /Ã/utf
 Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
@ -12,6 +12,9 @@ Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
 /ÃÃÃxxx/utf
 Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80
+/Ã‚‚‚‚‚‚‚‚Ã/utf
+Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set
 # Now test subjects
 /badutf/utf
@ -89,7 +92,7 @@ Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0
    \xfc\x80\x80\x80\x80\x8f
 Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0
    \x80
-Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 0
+Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0
    \xfe
 Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
    \xff
@ -1534,6 +1537,6 @@ Options: utf
 First code unit = 'x'
 Subject length lower bound = 1
    a\x80zx\=offset=3
-Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 1
+Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1
 # End of testinput10