Fix bugs in new UCP casing code for back references and characters with more
than 2 cases.
This commit is contained in:
parent
305e273e99
commit
3155a6951f
|
@ -69,7 +69,7 @@ character is decoded in JIT.
|
||||||
18. Changes in many areas of the code so that when Unicode is supported and
|
18. Changes in many areas of the code so that when Unicode is supported and
|
||||||
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
|
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
|
||||||
upper/lower case computations on characters whose code points are greater than
|
upper/lower case computations on characters whose code points are greater than
|
||||||
127. Documentation is not yet updated. JIT is not yet updated.
|
127.
|
||||||
|
|
||||||
19. The function for checking UTF-16 validity was returning an incorrect offset
|
19. The function for checking UTF-16 validity was returning an incorrect offset
|
||||||
for the start of the error when a high surrogate was not followed by a valid
|
for the start of the error when a high surrogate was not followed by a valid
|
||||||
|
|
|
@ -5565,12 +5565,12 @@ for (;; pptr++)
|
||||||
zerofirstcu = firstcu;
|
zerofirstcu = firstcu;
|
||||||
zerofirstcuflags = firstcuflags;
|
zerofirstcuflags = firstcuflags;
|
||||||
|
|
||||||
/* For caseless UTF mode, check whether this character has more than
|
/* For caseless UTF or UCP mode, check whether this character has more
|
||||||
one other case. If so, generate a special OP_NOTPROP item instead of
|
than one other case. If so, generate a special OP_NOTPROP item instead of
|
||||||
OP_NOTI. */
|
OP_NOTI. */
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && (options & PCRE2_CASELESS) != 0 &&
|
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
|
||||||
(d = UCD_CASESET(c)) != 0)
|
(d = UCD_CASESET(c)) != 0)
|
||||||
{
|
{
|
||||||
*code++ = OP_NOTPROP;
|
*code++ = OP_NOTPROP;
|
||||||
|
@ -7824,11 +7824,12 @@ for (;; pptr++)
|
||||||
NORMAL_CHAR_SET: /* Character is already in meta */
|
NORMAL_CHAR_SET: /* Character is already in meta */
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
|
|
||||||
/* For caseless UTF mode, check whether this character has more than one
|
/* For caseless UTF or UCP mode, check whether this character has more than
|
||||||
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
|
one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
|
||||||
|
*/
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf && (options & PCRE2_CASELESS) != 0)
|
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
|
||||||
{
|
{
|
||||||
uint32_t caseset = UCD_CASESET(meta);
|
uint32_t caseset = UCD_CASESET(meta);
|
||||||
if (caseset != 0)
|
if (caseset != 0)
|
||||||
|
|
|
@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset];
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
#if defined SUPPORT_UNICODE
|
#if defined SUPPORT_UNICODE
|
||||||
if ((mb->poptions & PCRE2_UTF) != 0)
|
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
||||||
|
|
||||||
|
if (utf || (mb->poptions & PCRE2_UCP) != 0)
|
||||||
{
|
{
|
||||||
|
PCRE2_SPTR endptr = p + length;
|
||||||
|
|
||||||
/* Match characters up to the end of the reference. NOTE: the number of
|
/* Match characters up to the end of the reference. NOTE: the number of
|
||||||
code units matched may differ, because in UTF-8 there are some characters
|
code units matched may differ, because in UTF-8 there are some characters
|
||||||
whose upper and lower case codes have different numbers of bytes. For
|
whose upper and lower case codes have different numbers of bytes. For
|
||||||
|
@ -390,16 +394,25 @@ if (caseless)
|
||||||
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
|
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
|
||||||
sequence of two of the latter. It is important, therefore, to check the
|
sequence of two of the latter. It is important, therefore, to check the
|
||||||
length along the reference, not along the subject (earlier code did this
|
length along the reference, not along the subject (earlier code did this
|
||||||
wrong). */
|
wrong). UCP without UTF uses Unicode properties but not UTF encoding. */
|
||||||
|
|
||||||
PCRE2_SPTR endptr = p + length;
|
|
||||||
while (p < endptr)
|
while (p < endptr)
|
||||||
{
|
{
|
||||||
uint32_t c, d;
|
uint32_t c, d;
|
||||||
const ucd_record *ur;
|
const ucd_record *ur;
|
||||||
if (eptr >= mb->end_subject) return 1; /* Partial match */
|
if (eptr >= mb->end_subject) return 1; /* Partial match */
|
||||||
GETCHARINC(c, eptr);
|
|
||||||
GETCHARINC(d, p);
|
if (utf)
|
||||||
|
{
|
||||||
|
GETCHARINC(c, eptr);
|
||||||
|
GETCHARINC(d, p);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
c = *eptr++;
|
||||||
|
d = *p++;
|
||||||
|
}
|
||||||
|
|
||||||
ur = GET_UCD(d);
|
ur = GET_UCD(d);
|
||||||
if (c != d && c != (uint32_t)((int)d + ur->other_case))
|
if (c != d && c != (uint32_t)((int)d + ur->other_case))
|
||||||
{
|
{
|
||||||
|
@ -415,7 +428,7 @@ if (caseless)
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Not in UTF mode */
|
/* Not in UTF or UCP mode */
|
||||||
{
|
{
|
||||||
for (; length > 0; length--)
|
for (; length > 0; length--)
|
||||||
{
|
{
|
||||||
|
@ -432,7 +445,8 @@ if (caseless)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* In the caseful case, we can just compare the code units, whether or not we
|
/* In the caseful case, we can just compare the code units, whether or not we
|
||||||
are in UTF mode. When partial matching, we have to do this unit-by-unit. */
|
are in UTF and/or UCP mode. When partial matching, we have to do this unit by
|
||||||
|
unit. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
|
@ -530,6 +530,20 @@
|
||||||
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
|
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
|
||||||
X\x{121}Y
|
X\x{121}Y
|
||||||
|
|
||||||
|
/s/i,ucp
|
||||||
|
\x{17f}
|
||||||
|
|
||||||
|
/s/i,utf
|
||||||
|
\x{17f}
|
||||||
|
|
||||||
|
/[^s]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{17f}
|
||||||
|
|
||||||
|
/[^s]/i,utf
|
||||||
|
\= Expect no match
|
||||||
|
\x{17f}
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -2181,4 +2181,7 @@
|
||||||
|
|
||||||
/(|ß)7/caseless,ucp
|
/(|ß)7/caseless,ucp
|
||||||
|
|
||||||
|
/(\xc1)\1/i,ucp
|
||||||
|
\xc1\xe1\=no_jit
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
|
@ -1761,6 +1761,24 @@ Subject length lower bound = 1
|
||||||
X\x{121}Y
|
X\x{121}Y
|
||||||
1: >\x{120}<
|
1: >\x{120}<
|
||||||
|
|
||||||
|
/s/i,ucp
|
||||||
|
\x{17f}
|
||||||
|
0: \x{17f}
|
||||||
|
|
||||||
|
/s/i,utf
|
||||||
|
\x{17f}
|
||||||
|
0: \x{17f}
|
||||||
|
|
||||||
|
/[^s]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{17f}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[^s]/i,utf
|
||||||
|
\= Expect no match
|
||||||
|
\x{17f}
|
||||||
|
No match
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1759,6 +1759,24 @@ Subject length lower bound = 1
|
||||||
X\x{121}Y
|
X\x{121}Y
|
||||||
1: >\x{120}<
|
1: >\x{120}<
|
||||||
|
|
||||||
|
/s/i,ucp
|
||||||
|
\x{17f}
|
||||||
|
0: \x{17f}
|
||||||
|
|
||||||
|
/s/i,utf
|
||||||
|
\x{17f}
|
||||||
|
0: \x{17f}
|
||||||
|
|
||||||
|
/[^s]/i,ucp
|
||||||
|
\= Expect no match
|
||||||
|
\x{17f}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/[^s]/i,utf
|
||||||
|
\= Expect no match
|
||||||
|
\x{17f}
|
||||||
|
No match
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -4940,4 +4940,9 @@ Subject length lower bound = 3
|
||||||
|
|
||||||
/(|ß)7/caseless,ucp
|
/(|ß)7/caseless,ucp
|
||||||
|
|
||||||
|
/(\xc1)\1/i,ucp
|
||||||
|
\xc1\xe1\=no_jit
|
||||||
|
0: \xc1\xe1
|
||||||
|
1: \xc1
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
Loading…
Reference in New Issue