From e413f3147cb00046982075ef06397f4a6c6a9727 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 13 Sep 2019 17:02:06 +0000 Subject: [PATCH] Optimize certain starting code unit bit maps into a single starting code unit. --- ChangeLog | 10 +++++ src/pcre2_study.c | 96 +++++++++++++++++++++++++++++++++++++++- testdata/testinput10 | 12 +++++ testdata/testinput12 | 12 +++++ testdata/testinput2 | 12 +++++ testdata/testoutput10 | 34 ++++++++++++++ testdata/testoutput12-16 | 34 ++++++++++++++ testdata/testoutput12-32 | 34 ++++++++++++++ testdata/testoutput2 | 36 ++++++++++++++- 9 files changed, 278 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index d234409..d584ab2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -160,6 +160,16 @@ specifying any character with codepoint >= 0x100. Now the only bits that are set are for the relevant bytes that start the wide characters. This can give a noticeable performance improvement. +35. If the bitmap of starting code units contains only 1 or 2 bits, replace it +with a single starting code unit (1 bit) or a caseless single starting code +unit if the two relevant characters are case-partners. This is particularly +relevant to the 8-bit library, though it applies to all. It can give a +performance boost for patterns such as [Ww]ord and (word|WORD). However, this +optimization doesn't happen if there is a "required" code unit of the same +value (because the search for a "required" code unit starts at the match start +for non-unique first code unit patterns, but after a unique first code unit, +and patterns such as a*a need the former action). + Version 10.33 16-April-2019 --------------------------- diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 23deebb..22c210b 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -1666,7 +1666,101 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) { int rc = set_start_bits(re, code, utf); if (rc == SSB_UNKNOWN) return 1; - if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET; + + /* If a list of starting code units was set up, scan the list to see if only + one or two were listed. Having only one listed is rare because usually a + single starting code unit will have been recognized and PCRE2_FIRSTSET set. + If two are listed, see if they are caseless versions of the same character; + if so we can replace the list with a caseless first code unit. This gives + better performance and is plausibly worth doing for patterns such as [Ww]ord + or (word|WORD). */ + + if (rc == SSB_DONE) + { + int i; + int a = -1; + int b = -1; + uint8_t *p = re->start_bitmap; + uint32_t flags = PCRE2_FIRSTMAPSET; + + for (i = 0; i < 256; p++, i += 8) + { + uint8_t x = *p; + if (x != 0) + { + int c; + uint8_t y = x & (~x + 1); /* Least significant bit */ + if (y != x) goto DONE; /* More than one bit set */ + + /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and + all wide characters", so we cannot use it here. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (i == 248 && x == 0x80) goto DONE; +#endif + + /* Compute the character value */ + + c = i; + switch (x) + { + case 1: break; + case 2: c += 1; break; case 4: c += 2; break; + case 8: c += 3; break; case 16: c += 4; break; + case 32: c += 5; break; case 64: c += 6; break; + case 128: c += 7; break; + } + + /* c contains the code unit value, in the range 0-255. In 8-bit UTF + mode, only values < 128 can be used. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (c > 127) goto DONE; +#endif + if (a < 0) a = c; /* First one found */ + else if (b < 0) /* Second one found */ + { + int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c); + +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ +#else /* 16-bit or 32-bit */ + if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ + if (utf && c > 127) d = UCD_OTHERCASE(c); +#endif /* Code width */ +#endif /* SUPPORT_UNICODE */ + + if (d != a) goto DONE; /* Not other case of a */ + b = c; + } + else goto DONE; /* More than two characters found */ + } + } + + /* Replace the start code unit bits with a first code unit, but only if it + is not the same as a required later code unit. This is because a search for + a required code unit starts after an explicit first code unit, but at a + code unit found from the bitmap. Patterns such as /a*a/ don't work + if both the start unit and required unit are the same. */ + + if (a >= 0 && + ( + (re->flags & PCRE2_LASTSET) == 0 || + ( + re->last_codeunit != (uint32_t)a && + (b < 0 || re->last_codeunit != (uint32_t)b) + ) + )) + { + re->first_codeunit = a; + flags = PCRE2_FIRSTSET; + if (b >= 0) flags |= PCRE2_FIRSTCASELESS; + } + + DONE: + re->flags |= flags; + } } /* Find the minimum length of subject string. If the pattern can match an empty diff --git a/testdata/testinput10 b/testdata/testinput10 index cf92525..dfcb145 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -567,4 +567,16 @@ /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf +/[\xff\x{ffff}]/I,utf + +/[\xff\x{ff}]/I,utf + +/[\xff\x{ff}]/I + +/[Ss]/I + +/[Ss]/I,utf + +/(?:\x{ff}|\x{3000})/I,utf + # End of testinput10 diff --git a/testdata/testinput12 b/testdata/testinput12 index dc4f809..bed00a5 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -451,4 +451,16 @@ /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf +/[\xff\x{ffff}]/I,utf + +/[\xff\x{ff}]/I,utf + +/[\xff\x{ff}]/I + +/[Ss]/I + +/[Ss]/I,utf + +/(?:\x{ff}|\x{3000})/I,utf + # End of testinput12 diff --git a/testdata/testinput2 b/testdata/testinput2 index 19f8b0a..655e519 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -1625,6 +1625,7 @@ /^a*A\d/IBi aaaA5 aaaa5 + a5 /(a*|b*)[cd]/I @@ -5760,4 +5761,15 @@ a)"xI /[aA]b[cC]/IB +/[cc]abcd/I + +/[Cc]abcd/I + +/[c]abcd/I + +/(?:c|C)abcd/I + +/(a)?a/I + manm + # End of testinput2 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 1fe44fb..4664b5a 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1769,4 +1769,38 @@ Options: utf Starting code units: \xef \xf0 \xf1 \xf2 \xf4 Subject length lower bound = 1 +/[\xff\x{ffff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xc3 \xef +Subject length lower bound = 1 + +/[\xff\x{ff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xc3 +Subject length lower bound = 1 + +/[\xff\x{ff}]/I +Capture group count = 0 +Starting code units: \xff +Subject length lower bound = 1 + +/[Ss]/I +Capture group count = 0 +First code unit = 'S' (caseless) +Subject length lower bound = 1 + +/[Ss]/I,utf +Capture group count = 0 +Options: utf +Starting code units: S s +Subject length lower bound = 1 + +/(?:\x{ff}|\x{3000})/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xc3 \xe3 +Subject length lower bound = 1 + # End of testinput10 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 9956ab1..3006bc1 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1594,4 +1594,38 @@ First code unit = \xc1 (caseless) Last code unit = \x{145} (caseless) Subject length lower bound = 3 +/[\xff\x{ffff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xff +Subject length lower bound = 1 + +/[\xff\x{ff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xff +Subject length lower bound = 1 + +/[\xff\x{ff}]/I +Capture group count = 0 +Starting code units: \xff +Subject length lower bound = 1 + +/[Ss]/I +Capture group count = 0 +Starting code units: S s +Subject length lower bound = 1 + +/[Ss]/I,utf +Capture group count = 0 +Options: utf +Starting code units: S s +Subject length lower bound = 1 + +/(?:\x{ff}|\x{3000})/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xff +Subject length lower bound = 1 + # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index ef89f56..ad240e2 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1592,4 +1592,38 @@ First code unit = \xc1 (caseless) Last code unit = \x{145} (caseless) Subject length lower bound = 3 +/[\xff\x{ffff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xff +Subject length lower bound = 1 + +/[\xff\x{ff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xff +Subject length lower bound = 1 + +/[\xff\x{ff}]/I +Capture group count = 0 +Starting code units: \xff +Subject length lower bound = 1 + +/[Ss]/I +Capture group count = 0 +Starting code units: S s +Subject length lower bound = 1 + +/[Ss]/I,utf +Capture group count = 0 +Options: utf +Starting code units: S s +Subject length lower bound = 1 + +/(?:\x{ff}|\x{3000})/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xff +Subject length lower bound = 1 + # End of testinput12 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 05da27e..c733c12 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -816,7 +816,7 @@ Capture group count = 1 Max back reference = 1 Compile options: Overall options: anchored -Starting code units: a +First code unit = 'a' Subject length lower bound = 4 \= Expect no match aaaa @@ -6492,6 +6492,8 @@ Subject length lower bound = 2 0: aaaA5 aaaa5 0: aaaa5 + a5 + 0: a5 /(a*|b*)[cd]/I Capture group count = 1 @@ -17401,6 +17403,38 @@ First code unit = 'a' (caseless) Last code unit = 'c' (caseless) Subject length lower bound = 3 +/[cc]abcd/I +Capture group count = 0 +First code unit = 'c' +Last code unit = 'd' +Subject length lower bound = 5 + +/[Cc]abcd/I +Capture group count = 0 +First code unit = 'C' (caseless) +Last code unit = 'd' +Subject length lower bound = 5 + +/[c]abcd/I +Capture group count = 0 +First code unit = 'c' +Last code unit = 'd' +Subject length lower bound = 5 + +/(?:c|C)abcd/I +Capture group count = 0 +First code unit = 'C' (caseless) +Last code unit = 'd' +Subject length lower bound = 5 + +/(a)?a/I +Capture group count = 1 +Starting code units: a +Last code unit = 'a' +Subject length lower bound = 1 + manm + 0: a + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data