Fix bugs in new UCP casing code for back references and characters with more than 2 cases.
This commit is contained in:
Philip.Hazel 2020-02-26 16:53:39 +00:00
parent 305e273e99
commit 3155a6951f
8 changed files with 88 additions and 15 deletions

View File

@ -69,7 +69,7 @@ character is decoded in JIT.
18. Changes in many areas of the code so that when Unicode is supported and 18. Changes in many areas of the code so that when Unicode is supported and
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
upper/lower case computations on characters whose code points are greater than upper/lower case computations on characters whose code points are greater than
127. Documentation is not yet updated. JIT is not yet updated. 127.
19. The function for checking UTF-16 validity was returning an incorrect offset 19. The function for checking UTF-16 validity was returning an incorrect offset
for the start of the error when a high surrogate was not followed by a valid for the start of the error when a high surrogate was not followed by a valid

View File

@ -5565,12 +5565,12 @@ for (;; pptr++)
zerofirstcu = firstcu; zerofirstcu = firstcu;
zerofirstcuflags = firstcuflags; zerofirstcuflags = firstcuflags;
/* For caseless UTF mode, check whether this character has more than /* For caseless UTF or UCP mode, check whether this character has more
one other case. If so, generate a special OP_NOTPROP item instead of than one other case. If so, generate a special OP_NOTPROP item instead of
OP_NOTI. */ OP_NOTI. */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0 && if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
(d = UCD_CASESET(c)) != 0) (d = UCD_CASESET(c)) != 0)
{ {
*code++ = OP_NOTPROP; *code++ = OP_NOTPROP;
@ -7824,11 +7824,12 @@ for (;; pptr++)
NORMAL_CHAR_SET: /* Character is already in meta */ NORMAL_CHAR_SET: /* Character is already in meta */
matched_char = TRUE; matched_char = TRUE;
/* For caseless UTF mode, check whether this character has more than one /* For caseless UTF or UCP mode, check whether this character has more than
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
*/
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0) if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
{ {
uint32_t caseset = UCD_CASESET(meta); uint32_t caseset = UCD_CASESET(meta);
if (caseset != 0) if (caseset != 0)

View File

@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset];
if (caseless) if (caseless)
{ {
#if defined SUPPORT_UNICODE #if defined SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UTF) != 0) BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
if (utf || (mb->poptions & PCRE2_UCP) != 0)
{ {
PCRE2_SPTR endptr = p + length;
/* Match characters up to the end of the reference. NOTE: the number of /* Match characters up to the end of the reference. NOTE: the number of
code units matched may differ, because in UTF-8 there are some characters code units matched may differ, because in UTF-8 there are some characters
whose upper and lower case codes have different numbers of bytes. For whose upper and lower case codes have different numbers of bytes. For
@ -390,16 +394,25 @@ if (caseless)
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
sequence of two of the latter. It is important, therefore, to check the sequence of two of the latter. It is important, therefore, to check the
length along the reference, not along the subject (earlier code did this length along the reference, not along the subject (earlier code did this
wrong). */ wrong). UCP without UTF uses Unicode properties but not UTF encoding. */
PCRE2_SPTR endptr = p + length;
while (p < endptr) while (p < endptr)
{ {
uint32_t c, d; uint32_t c, d;
const ucd_record *ur; const ucd_record *ur;
if (eptr >= mb->end_subject) return 1; /* Partial match */ if (eptr >= mb->end_subject) return 1; /* Partial match */
if (utf)
{
GETCHARINC(c, eptr); GETCHARINC(c, eptr);
GETCHARINC(d, p); GETCHARINC(d, p);
}
else
{
c = *eptr++;
d = *p++;
}
ur = GET_UCD(d); ur = GET_UCD(d);
if (c != d && c != (uint32_t)((int)d + ur->other_case)) if (c != d && c != (uint32_t)((int)d + ur->other_case))
{ {
@ -415,7 +428,7 @@ if (caseless)
else else
#endif #endif
/* Not in UTF mode */ /* Not in UTF or UCP mode */
{ {
for (; length > 0; length--) for (; length > 0; length--)
{ {
@ -432,7 +445,8 @@ if (caseless)
} }
/* In the caseful case, we can just compare the code units, whether or not we /* In the caseful case, we can just compare the code units, whether or not we
are in UTF mode. When partial matching, we have to do this unit-by-unit. */ are in UTF and/or UCP mode. When partial matching, we have to do this unit by
unit. */
else else
{ {

14
testdata/testinput12 vendored
View File

@ -530,6 +530,20 @@
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y X\x{121}Y
/s/i,ucp
\x{17f}
/s/i,utf
\x{17f}
/[^s]/i,ucp
\= Expect no match
\x{17f}
/[^s]/i,utf
\= Expect no match
\x{17f}
# ---------------------------------------------------- # ----------------------------------------------------
# End of testinput12 # End of testinput12

3
testdata/testinput5 vendored
View File

@ -2181,4 +2181,7 @@
/(|ß)7/caseless,ucp /(|ß)7/caseless,ucp
/(\xc1)\1/i,ucp
\xc1\xe1\=no_jit
# End of testinput5 # End of testinput5

View File

@ -1761,6 +1761,24 @@ Subject length lower bound = 1
X\x{121}Y X\x{121}Y
1: >\x{120}< 1: >\x{120}<
/s/i,ucp
\x{17f}
0: \x{17f}
/s/i,utf
\x{17f}
0: \x{17f}
/[^s]/i,ucp
\= Expect no match
\x{17f}
No match
/[^s]/i,utf
\= Expect no match
\x{17f}
No match
# ---------------------------------------------------- # ----------------------------------------------------
# End of testinput12 # End of testinput12

View File

@ -1759,6 +1759,24 @@ Subject length lower bound = 1
X\x{121}Y X\x{121}Y
1: >\x{120}< 1: >\x{120}<
/s/i,ucp
\x{17f}
0: \x{17f}
/s/i,utf
\x{17f}
0: \x{17f}
/[^s]/i,ucp
\= Expect no match
\x{17f}
No match
/[^s]/i,utf
\= Expect no match
\x{17f}
No match
# ---------------------------------------------------- # ----------------------------------------------------
# End of testinput12 # End of testinput12

View File

@ -4940,4 +4940,9 @@ Subject length lower bound = 3
/(|ß)7/caseless,ucp /(|ß)7/caseless,ucp
/(\xc1)\1/i,ucp
\xc1\xe1\=no_jit
0: \xc1\xe1
1: \xc1
# End of testinput5 # End of testinput5