Fix bad offset value in invalid UTF pattern error.

2015-11-27 15:58:44 +00:00 · 2015-11-27 15:58:44 +00:00 · aec5c96cf5
parent 2eb24e2dac
commit aec5c96cf5
5 changed files with 19 additions and 10 deletions
--- a/5
+++ b/5
@@ -332,7 +332,10 @@ because it sets the "startoffset" parameter for pcre2_match().

 99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between 
 an item and its qualifier (for example, A(?#comment)?B) pcre2_compile() 
-misbehaved.
+misbehaved. This bug was found by the LLVM fuzzer.
+
+100. The error for an invalid UTF pattern string always gave the code unit 
+offset as zero instead of where the invalidity was found.


 Version 10.20 30-June-2015
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -8468,7 +8468,7 @@ if (utf)
    }
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
-    goto HAD_ERROR;
+    goto HAD_UTF_ERROR;
  }

 /* Check UCP lockout. */
@@ -8849,10 +8849,11 @@ via the dreaded goto. */
 if (errorcode != 0)
  {
  HAD_ERROR:
+  *erroroffset = (int)(ptr - pattern);
+  HAD_UTF_ERROR:
+  *errorptr = errorcode;
  pcre2_code_free(re);
  re = NULL;
-  *errorptr = errorcode;
-  *erroroffset = (int)(ptr - pattern);
  goto EXIT;
  }

--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@@ -204,7 +204,7 @@ static const char match_error_texts[] =
  /* 20 */
  "UTF-8 error: overlong 5-byte sequence\0"
  "UTF-8 error: overlong 6-byte sequence\0"
-  "UTF-8 error: isolated 0x80 byte\0"
+  "UTF-8 error: isolated byte with 0x80 bit set\0"
  "UTF-8 error: illegal byte (0xfe or 0xff)\0"
  "UTF-16 error: missing low surrogate at end\0"
  /* 25 */
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -1,7 +1,7 @@
 # This set of tests is for UTF-8 support and Unicode property support, with
 # relevance only for the 8-bit library.

-# The next 3 patterns have UTF-8 errors
+# The next 4 patterns have UTF-8 errors

 /[Ã]/utf

@@ -9,6 +9,8 @@

 /ÃÃÃxxx/utf

+/Ã‚‚‚‚‚‚‚‚Ã/utf
+
 # Now test subjects

 /badutf/utf
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -1,10 +1,10 @@
 # This set of tests is for UTF-8 support and Unicode property support, with
 # relevance only for the 8-bit library.

-# The next 3 patterns have UTF-8 errors
+# The next 4 patterns have UTF-8 errors

 /[Ã]/utf
-Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80
+Failed: error -8 at offset 1: UTF-8 error: byte 2 top bits not 0x80

 /Ã/utf
 Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
@@ -12,6 +12,9 @@ Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
 /ÃÃÃxxx/utf
 Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80

+/Ã‚‚‚‚‚‚‚‚Ã/utf
+Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set
+
 # Now test subjects

 /badutf/utf
@@ -89,7 +92,7 @@ Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0
    \xfc\x80\x80\x80\x80\x8f
 Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0
    \x80
-Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 0
+Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0
    \xfe
 Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
    \xff
@@ -1534,6 +1537,6 @@ Options: utf
 First code unit = 'x'
 Subject length lower bound = 1
    a\x80zx\=offset=3
-Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 1
+Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1

 # End of testinput10