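Fix bug in UTF-16 validity checker: it returned the wrong error offset when a high surrogate was not followed by a valid low surrogate (the offset now points at the high surrogate).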
This commit is contained in:
parent
a3057bbecd
commit
f50ee03f5d
|
@ -71,6 +71,12 @@ PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
|
||||||
upper/lower case computations on characters whose code points are greater than
|
upper/lower case computations on characters whose code points are greater than
|
||||||
127. Documentation is not yet updated. JIT is not yet updated.
|
127. Documentation is not yet updated. JIT is not yet updated.
|
||||||
|
|
||||||
|
19. The function for checking UTF-16 validity was returning an incorrect offset
|
||||||
|
for the start of the error when a high surrogate was not followed by a valid
|
||||||
|
low surrogate. This caused incorrect behaviour, for example when
|
||||||
|
PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the
|
||||||
|
invalid high surrogate, such as /aa/ matching "\x{d800}aa".
|
||||||
|
|
||||||
|
|
||||||
Version 10.34 21-November-2019
|
Version 10.34 21-November-2019
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2016-2017 University of Cambridge
|
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -347,7 +347,7 @@ for (p = string; length > 0; p++)
|
||||||
length--;
|
length--;
|
||||||
if ((*p & 0xfc00) != 0xdc00)
|
if ((*p & 0xfc00) != 0xdc00)
|
||||||
{
|
{
|
||||||
*erroroffset = p - string;
|
*erroroffset = p - string - 1;
|
||||||
return PCRE2_ERROR_UTF16_ERR2;
|
return PCRE2_ERROR_UTF16_ERR2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -444,6 +444,12 @@
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
A\x{d800}B
|
A\x{d800}B
|
||||||
A\x{110000}B
|
A\x{110000}B
|
||||||
|
|
||||||
|
/aa/utf,ucp,match_invalid_utf,global
|
||||||
|
aa\x{d800}aa
|
||||||
|
|
||||||
|
/aa/utf,ucp,match_invalid_utf,global
|
||||||
|
\x{d800}aa
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -533,7 +533,7 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
|
||||||
XX\x{110000}
|
XX\x{110000}
|
||||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||||
XX\x{d800}\x{1234}
|
XX\x{d800}\x{1234}
|
||||||
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
|
Failed: error -25: UTF-16 error: invalid low surrogate at offset 2
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
XX\x{d800}\=offset=3
|
XX\x{d800}\=offset=3
|
||||||
No match
|
No match
|
||||||
|
@ -1576,6 +1576,15 @@ No match
|
||||||
No match
|
No match
|
||||||
A\x{110000}B
|
A\x{110000}B
|
||||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||||
|
|
||||||
|
/aa/utf,ucp,match_invalid_utf,global
|
||||||
|
aa\x{d800}aa
|
||||||
|
0: aa
|
||||||
|
0: aa
|
||||||
|
|
||||||
|
/aa/utf,ucp,match_invalid_utf,global
|
||||||
|
\x{d800}aa
|
||||||
|
0: aa
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -1574,6 +1574,15 @@ No match
|
||||||
No match
|
No match
|
||||||
A\x{110000}B
|
A\x{110000}B
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/aa/utf,ucp,match_invalid_utf,global
|
||||||
|
aa\x{d800}aa
|
||||||
|
0: aa
|
||||||
|
0: aa
|
||||||
|
|
||||||
|
/aa/utf,ucp,match_invalid_utf,global
|
||||||
|
\x{d800}aa
|
||||||
|
0: aa
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
|
||||||
XX\x{110000}
|
XX\x{110000}
|
||||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||||
XX\x{d800}\x{1234}
|
XX\x{d800}\x{1234}
|
||||||
Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
|
Failed: error -25: UTF-16 error: invalid low surrogate at offset 2
|
||||||
|
|
||||||
/badutf/utf
|
/badutf/utf
|
||||||
X\xdf
|
X\xdf
|
||||||
|
|
Loading…
Reference in New Issue