Fix bad offset value in invalid UTF pattern error.

This commit is contained in:
Philip.Hazel 2015-11-27 15:58:44 +00:00
parent 2eb24e2dac
commit aec5c96cf5
5 changed files with 19 additions and 10 deletions

View File

@ -332,7 +332,10 @@ because it sets the "startoffset" parameter for pcre2_match().
99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between
an item and its quantifier (for example, A(?#comment)?B) pcre2_compile()
misbehaved.
misbehaved. This bug was found by the LLVM fuzzer.
100. The error for an invalid UTF pattern string always gave the code unit
offset as zero instead of where the invalidity was found.
Version 10.20 30-June-2015

View File

@ -8468,7 +8468,7 @@ if (utf)
}
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
goto HAD_ERROR;
goto HAD_UTF_ERROR;
}
/* Check UCP lockout. */
@ -8849,10 +8849,11 @@ via the dreaded goto. */
if (errorcode != 0)
{
HAD_ERROR:
*erroroffset = (int)(ptr - pattern);
HAD_UTF_ERROR:
*errorptr = errorcode;
pcre2_code_free(re);
re = NULL;
*errorptr = errorcode;
*erroroffset = (int)(ptr - pattern);
goto EXIT;
}

View File

@ -204,7 +204,7 @@ static const char match_error_texts[] =
/* 20 */
"UTF-8 error: overlong 5-byte sequence\0"
"UTF-8 error: overlong 6-byte sequence\0"
"UTF-8 error: isolated 0x80 byte\0"
"UTF-8 error: isolated byte with 0x80 bit set\0"
"UTF-8 error: illegal byte (0xfe or 0xff)\0"
"UTF-16 error: missing low surrogate at end\0"
/* 25 */

View File

@ -1,7 +1,7 @@
# This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library.
# The next 3 patterns have UTF-8 errors
# The next 4 patterns have UTF-8 errors
/[Ã]/utf
@ -9,6 +9,8 @@
/ÃÃÃxxx/utf
/‚‚‚‚‚‚‚Ã/utf
# Now test subjects
/badutf/utf

11
testdata/testoutput10 vendored
View File

@ -1,10 +1,10 @@
# This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library.
# The next 3 patterns have UTF-8 errors
# The next 4 patterns have UTF-8 errors
/[Ã]/utf
Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80
Failed: error -8 at offset 1: UTF-8 error: byte 2 top bits not 0x80
/Ã/utf
Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
@ -12,6 +12,9 @@ Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
/ÃÃÃxxx/utf
Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80
/‚‚‚‚‚‚‚Ã/utf
Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set
# Now test subjects
/badutf/utf
@ -89,7 +92,7 @@ Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0
\xfc\x80\x80\x80\x80\x8f
Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0
\x80
Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 0
Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0
\xfe
Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
\xff
@ -1534,6 +1537,6 @@ Options: utf
First code unit = 'x'
Subject length lower bound = 1
a\x80zx\=offset=3
Failed: error -22: UTF-8 error: isolated 0x80 byte at offset 1
Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1
# End of testinput10