From fdd94791086e466835ed61640cf31380efd1fbc5 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Wed, 26 Jan 2022 08:37:18 +0000 Subject: [PATCH] Fix incorrect compiling when [Aa] etc. are quantified --- ChangeLog | 6 ++++++ src/pcre2_compile.c | 21 +++++++++++++-------- testdata/testinput2 | 9 +++++++++ testdata/testoutput2 | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4eb3fa7..f0eab7b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -86,6 +86,12 @@ Clarke PR#72. 21. A user discovered that the library names in CMakeLists.txt for MSVC debugger (PDB) files were incorrect - perhaps never tried for PCRE2? +22. An item such as [Aa] is optimized into a caseless single character match. +When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a +pattern, the optimizing "must be present for a match" character check was not +being flagged as caseless, causing some matches that should have succeeded to +fail. + Version 10.39 29-October-2021 ----------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index fe2bf69..de259c9 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2115,17 +2115,17 @@ if (c == CHAR_LEFT_CURLY_BRACKET) { if (ptr >= cb->end_pattern) goto ERROR_RETURN; c = *ptr++; - while (c == '_' || c == '-' || isspace(c)) + while (c == '_' || c == '-' || isspace(c)) { if (ptr >= cb->end_pattern) goto ERROR_RETURN; c = *ptr++; - } + } if (c == CHAR_NUL) goto ERROR_RETURN; if (c == CHAR_RIGHT_CURLY_BRACKET) break; name[i] = tolower(c); if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i; } - + if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; name[i] = 0; } @@ -2159,16 +2159,16 @@ another property can be diagnosed. */ if (vptr != NULL) { int offset = 0; - PCRE2_UCHAR sname[8]; + PCRE2_UCHAR sname[8]; *vptr = 0; /* Terminate property name */ if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 || PRIV(strcmp_c8)(name, STRING_bc) == 0) { offset = 4; - sname[0] = CHAR_b; + sname[0] = CHAR_b; sname[1] = CHAR_i; /* There is no strcpy_c8 function */ - sname[2] = CHAR_d; + sname[2] = CHAR_d; sname[3] = CHAR_i; } @@ -7023,14 +7023,19 @@ for (;; pptr++) #endif /* MAYBE_UTF_MULTI */ /* Handle the case of a single code unit - either with no UTF support, or - with UTF disabled, or for a single-code-unit UTF character. */ + with UTF disabled, or for a single-code-unit UTF character. In the latter + case, for a repeated positive match, get the caseless flag for the + required code unit from the previous character, because a class like [Aa] + sets a caseless A but by now the req_caseopt flag has been reset. */ + { mcbuffer[0] = code[-1]; mclength = 1; if (op_previous <= OP_CHARI && repeat_min > 1) { reqcu = mcbuffer[0]; - reqcuflags = req_caseopt | cb->req_varyopt; + reqcuflags = cb->req_varyopt; + if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS; } } goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 849bc2e..d37d8f3 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5923,4 +5923,13 @@ a)"xI # --------- +/[Aa]{2}/BI + aabcd + +/A{2}/iBI + aabcd + +/[Aa]{2,3}/BI + aabcd + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index ce10f2b..ce090f8 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -17702,6 +17702,50 @@ Failed: error -51: NULL argument passed with non-zero length # --------- +/[Aa]{2}/BI +------------------------------------------------------------------ + Bra + /i A{2} + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +First code unit = 'A' (caseless) +Last code unit = 'A' (caseless) +Subject length lower bound = 2 + aabcd + 0: aa + +/A{2}/iBI +------------------------------------------------------------------ + Bra + /i A{2} + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +Options: caseless +First code unit = 'A' (caseless) +Last code unit = 'A' (caseless) +Subject length lower bound = 2 + aabcd + 0: aa + +/[Aa]{2,3}/BI +------------------------------------------------------------------ + Bra + /i A{2} + /i A?+ + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +First code unit = 'A' (caseless) +Last code unit = 'A' (caseless) +Subject length lower bound = 2 + aabcd + 0: aa + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data