diff --git a/ChangeLog b/ChangeLog index 95e0123..1e4e778 100644 --- a/ChangeLog +++ b/ChangeLog @@ -69,7 +69,7 @@ character is decoded in JIT. 18. Changes in many areas of the code so that when Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for upper/lower case computations on characters whose code points are greater than -127. Documentation is not yet updated. JIT is not yet updated. +127. 19. The function for checking UTF-16 validity was returning an incorrect offset for the start of the error when a high surrogate was not followed by a valid diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 13769a0..9f05d19 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -5565,12 +5565,12 @@ for (;; pptr++) zerofirstcu = firstcu; zerofirstcuflags = firstcuflags; - /* For caseless UTF mode, check whether this character has more than - one other case. If so, generate a special OP_NOTPROP item instead of + /* For caseless UTF or UCP mode, check whether this character has more + than one other case. If so, generate a special OP_NOTPROP item instead of OP_NOTI. */ #ifdef SUPPORT_UNICODE - if (utf && (options & PCRE2_CASELESS) != 0 && + if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 && (d = UCD_CASESET(c)) != 0) { *code++ = OP_NOTPROP; @@ -7824,11 +7824,12 @@ for (;; pptr++) NORMAL_CHAR_SET: /* Character is already in meta */ matched_char = TRUE; - /* For caseless UTF mode, check whether this character has more than one - other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ + /* For caseless UTF or UCP mode, check whether this character has more than + one other case. If so, generate a special OP_PROP item instead of OP_CHARI. 
+ */ #ifdef SUPPORT_UNICODE - if (utf && (options & PCRE2_CASELESS) != 0) + if ((utf||ucp) && (options & PCRE2_CASELESS) != 0) { uint32_t caseset = UCD_CASESET(meta); if (caseset != 0) diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 77c98f5..4b86134 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset]; if (caseless) { #if defined SUPPORT_UNICODE - if ((mb->poptions & PCRE2_UTF) != 0) + BOOL utf = (mb->poptions & PCRE2_UTF) != 0; + + if (utf || (mb->poptions & PCRE2_UCP) != 0) { + PCRE2_SPTR endptr = p + length; + /* Match characters up to the end of the reference. NOTE: the number of code units matched may differ, because in UTF-8 there are some characters whose upper and lower case codes have different numbers of bytes. For @@ -390,16 +394,25 @@ if (caseless) bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a sequence of two of the latter. It is important, therefore, to check the length along the reference, not along the subject (earlier code did this - wrong). */ - - PCRE2_SPTR endptr = p + length; + wrong). UCP uses Unicode properties but without UTF encoding. */ + while (p < endptr) { uint32_t c, d; const ucd_record *ur; if (eptr >= mb->end_subject) return 1; /* Partial match */ - GETCHARINC(c, eptr); - GETCHARINC(d, p); + + if (utf) + { + GETCHARINC(c, eptr); + GETCHARINC(d, p); + } + else + { + c = *eptr++; + d = *p++; + } + ur = GET_UCD(d); if (c != d && c != (uint32_t)((int)d + ur->other_case)) { @@ -415,7 +428,7 @@ if (caseless) else #endif - /* Not in UTF mode */ + /* Not in UTF or UCP mode */ { for (; length > 0; length--) { @@ -432,7 +445,8 @@ if (caseless) } /* In the caseful case, we can just compare the code units, whether or not we -are in UTF mode. When partial matching, we have to do this unit-by-unit. */ +are in UTF and/or UCP mode. When partial matching, we have to do this unit by +unit. 
*/ else { diff --git a/testdata/testinput12 b/testdata/testinput12 index fbfacc5..9b4f8d3 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -530,6 +530,20 @@ /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended X\x{121}Y +/s/i,ucp + \x{17f} + +/s/i,utf + \x{17f} + +/[^s]/i,ucp +\= Expect no match + \x{17f} + +/[^s]/i,utf +\= Expect no match + \x{17f} + # ---------------------------------------------------- # End of testinput12 diff --git a/testdata/testinput5 b/testdata/testinput5 index b3fcfef..ecac178 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2181,4 +2181,7 @@ /(|ß)7/caseless,ucp +/(\xc1)\1/i,ucp + \xc1\xe1\=no_jit + # End of testinput5 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 9689ab1..84c4858 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1761,6 +1761,24 @@ Subject length lower bound = 1 X\x{121}Y 1: >\x{120}< +/s/i,ucp + \x{17f} + 0: \x{17f} + +/s/i,utf + \x{17f} + 0: \x{17f} + +/[^s]/i,ucp +\= Expect no match + \x{17f} +No match + +/[^s]/i,utf +\= Expect no match + \x{17f} +No match + # ---------------------------------------------------- # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index c51c517..03b6e39 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1759,6 +1759,24 @@ Subject length lower bound = 1 X\x{121}Y 1: >\x{120}< +/s/i,ucp + \x{17f} + 0: \x{17f} + +/s/i,utf + \x{17f} + 0: \x{17f} + +/[^s]/i,ucp +\= Expect no match + \x{17f} +No match + +/[^s]/i,utf +\= Expect no match + \x{17f} +No match + # ---------------------------------------------------- # End of testinput12 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 5bdf873..2ff8516 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4940,4 +4940,9 @@ Subject length lower bound = 3 /(|ß)7/caseless,ucp +/(\xc1)\1/i,ucp + \xc1\xe1\=no_jit + 0: \xc1\xe1 + 1: \xc1 + # End of testinput5