From 3c869816ace0ae4dcce6bab78411cccaa5fb3dbc Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 16 Nov 2019 17:30:07 +0000 Subject: [PATCH] Fix sometimes failing caseless non-ASCII matching in assertion. --- ChangeLog | 5 +++++ src/pcre2_compile.c | 13 +++++++++++++ testdata/testinput4 | 8 ++++++++ testdata/testoutput4 | 12 ++++++++++++ 4 files changed, 38 insertions(+) diff --git a/ChangeLog b/ChangeLog index dd52203..27033ae 100644 --- a/ChangeLog +++ b/ChangeLog @@ -177,6 +177,11 @@ sanitizer complaint (regexec is supposed to be thread safe). 37. Add NEON vectorization to JIT to speed up matching of first character and pairs of characters on ARM64 CPUs. +38. If a non-ASCII character was the first in a starting assertion in a +caseless match, the "first code unit" optimization did not get the casing +right, and the assertion failed to match a character in the other case if it +did not start with the same code unit. + Version 10.33 16-April-2019 --------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 3204973..800b61b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -8741,6 +8741,19 @@ do { case OP_MINPLUSI: case OP_POSPLUSI: if (inassert == 0) return 0; + + /* If the character is more than one code unit long, we cannot set its + first code unit when matching caselessly. Later scanning may pick up + multiple code units. */ + +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (scode[1] >= 0x80) return 0; +#elif PCRE2_CODE_UNIT_WIDTH == 16 + if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0; +#endif +#endif + if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; } else if (c != scode[1]) return 0; break; diff --git a/testdata/testinput4 b/testdata/testinput4 index f3d498c..0871835 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2483,4 +2483,12 @@ /\X*/ \xF3aaa\xE4\xEA\xEB\xFEa +/Я/i,utf + \x{42f} + \x{44f} + +/(?=Я)/i,utf + \x{42f} + \x{44f} + # End of testinput4 diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 53926ed..2c8037b 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -4016,4 +4016,16 @@ No match \xF3aaa\xE4\xEA\xEB\xFEa 0: \xf3aaa\xe4\xea\xeb\xfea +/Я/i,utf + \x{42f} + 0: \x{42f} + \x{44f} + 0: \x{44f} + +/(?=Я)/i,utf + \x{42f} + 0: + \x{44f} + 0: + # End of testinput4