Optimize certain starting code unit bit maps into a single starting code unit.

2019-09-13 17:02:06 +00:00 · 2019-09-13 17:02:06 +00:00 · e413f3147c
parent d917899be5
commit e413f3147c
9 changed files with 278 additions and 2 deletions
--- a/10
+++ b/10
@ -160,6 +160,16 @@ specifying any character with codepoint >= 0x100. Now the only bits that are
 set are for the relevant bytes that start the wide characters. This can give a 
 noticeable performance improvement.
 35. If the bitmap of starting code units contains only 1 or 2 bits, replace it 
 with a single starting code unit (1 bit) or a caseless single starting code 
 unit if the two relevant characters are case-partners. This is particularly 
 relevant to the 8-bit library, though it applies to all. It can give a 
 performance boost for patterns such as [Ww]ord and (word|WORD). However, this 
 optimization doesn't happen if there is a "required" code unit of the same 
 value (because the search for a "required" code unit starts at the match start 
 for non-unique first code unit patterns, but after a unique first code unit, 
 and patterns such as a*a need the former action).
 Version 10.33 16-April-2019
 ---------------------------
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -1666,7 +1666,101 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
  {
  int rc = set_start_bits(re, code, utf);
  if (rc == SSB_UNKNOWN) return 1;
-  if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
+
  /* If a list of starting code units was set up, scan the list to see if only
  one or two were listed. Having only one listed is rare because usually a
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
  If two are listed, see if they are caseless versions of the same character;
  if so we can replace the list with a caseless first code unit. This gives
  better performance and is plausibly worth doing for patterns such as [Ww]ord
  or (word|WORD). */
  if (rc == SSB_DONE)
    {
    int i;
    int a = -1;
    int b = -1;
    uint8_t *p = re->start_bitmap;
    uint32_t flags = PCRE2_FIRSTMAPSET;
    for (i = 0; i < 256; p++, i += 8)
      {
      uint8_t x = *p;
      if (x != 0)
        {
        int c;
        uint8_t y = x & (~x + 1);   /* Least significant bit */
        if (y != x) goto DONE;      /* More than one bit set */
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
        all wide characters", so we cannot use it here. */
 #if PCRE2_CODE_UNIT_WIDTH != 8
        if (i == 248 && x == 0x80) goto DONE;
 #endif
        /* Compute the character value */
        c = i;
        switch (x)
          {
          case 1:   break;
          case 2:   c += 1; break;  case 4:  c += 2; break;
          case 8:   c += 3; break;  case 16: c += 4; break;
          case 32:  c += 5; break;  case 64: c += 6; break;
          case 128: c += 7; break;
          }
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
        mode, only values < 128 can be used. */
 #if PCRE2_CODE_UNIT_WIDTH == 8
        if (c > 127) goto DONE;
 #endif
        if (a < 0) a = c;   /* First one found */
        else if (b < 0)     /* Second one found */
          {
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
 #ifdef SUPPORT_UNICODE
 #if PCRE2_CODE_UNIT_WIDTH == 8
          if (utf && UCD_CASESET(c) != 0) goto DONE;   /* Multiple case set */
 #else   /* 16-bit or 32-bit */
          if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
          if (utf && c > 127) d = UCD_OTHERCASE(c);
 #endif  /* Code width */
 #endif  /* SUPPORT_UNICODE */
          if (d != a) goto DONE;   /* Not other case of a */
          b = c;
          }
        else goto DONE;   /* More than two characters found */
        }
      }
    /* Replace the start code unit bits with a first code unit, but only if it 
    is not the same as a required later code unit. This is because a search for
    a required code unit starts after an explicit first code unit, but at a
    code unit found from the bitmap. Patterns such as /a*a/ don't work 
    if both the start unit and required unit are the same. */
    if (a >= 0 && 
        (
        (re->flags & PCRE2_LASTSET) == 0 || 
          (
          re->last_codeunit != (uint32_t)a && 
          (b < 0 || re->last_codeunit != (uint32_t)b)
          )
        ))
      {
      re->first_codeunit = a;
      flags = PCRE2_FIRSTSET;
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
      }
    DONE:
    re->flags |= flags;
    }
  }
 /* Find the minimum length of subject string. If the pattern can match an empty
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -567,4 +567,16 @@
 /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
 /[\xff\x{ffff}]/I,utf
 /[\xff\x{ff}]/I,utf
 /[\xff\x{ff}]/I
 /[Ss]/I
 /[Ss]/I,utf
 /(?:\x{ff}|\x{3000})/I,utf
 # End of testinput10
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -451,4 +451,16 @@
 /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
 /[\xff\x{ffff}]/I,utf
 /[\xff\x{ff}]/I,utf
 /[\xff\x{ff}]/I
 /[Ss]/I
 /[Ss]/I,utf
 /(?:\x{ff}|\x{3000})/I,utf
 # End of testinput12
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -1625,6 +1625,7 @@
 /^a*A\d/IBi
    aaaA5
    aaaa5
    a5 
 /(a*|b*)[cd]/I
@ -5760,4 +5761,15 @@ a)"xI
 /[aA]b[cC]/IB
 /[cc]abcd/I
 /[Cc]abcd/I
 /[c]abcd/I
 /(?:c|C)abcd/I
 /(a)?a/I
    manm
 # End of testinput2
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1769,4 +1769,38 @@ Options: utf
 Starting code units: \xef \xf0 \xf1 \xf2 \xf4 
 Subject length lower bound = 1
 /[\xff\x{ffff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xc3 \xef 
 Subject length lower bound = 1
 /[\xff\x{ff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xc3 
 Subject length lower bound = 1
 /[\xff\x{ff}]/I
 Capture group count = 0
 Starting code units: \xff 
 Subject length lower bound = 1
 /[Ss]/I
 Capture group count = 0
 First code unit = 'S' (caseless)
 Subject length lower bound = 1
 /[Ss]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: S s 
 Subject length lower bound = 1
 /(?:\x{ff}|\x{3000})/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xc3 \xe3 
 Subject length lower bound = 1
 # End of testinput10
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -1594,4 +1594,38 @@ First code unit = \xc1 (caseless)
 Last code unit = \x{145} (caseless)
 Subject length lower bound = 3
 /[\xff\x{ffff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 /[\xff\x{ff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 /[\xff\x{ff}]/I
 Capture group count = 0
 Starting code units: \xff 
 Subject length lower bound = 1
 /[Ss]/I
 Capture group count = 0
 Starting code units: S s 
 Subject length lower bound = 1
 /[Ss]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: S s 
 Subject length lower bound = 1
 /(?:\x{ff}|\x{3000})/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 # End of testinput12
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -1592,4 +1592,38 @@ First code unit = \xc1 (caseless)
 Last code unit = \x{145} (caseless)
 Subject length lower bound = 3
 /[\xff\x{ffff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 /[\xff\x{ff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 /[\xff\x{ff}]/I
 Capture group count = 0
 Starting code units: \xff 
 Subject length lower bound = 1
 /[Ss]/I
 Capture group count = 0
 Starting code units: S s 
 Subject length lower bound = 1
 /[Ss]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: S s 
 Subject length lower bound = 1
 /(?:\x{ff}|\x{3000})/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 # End of testinput12
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -816,7 +816,7 @@ Capture group count = 1
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
-Starting code units: a 
+First code unit = 'a'
 Subject length lower bound = 4
 \= Expect no match
    aaaa
@ -6492,6 +6492,8 @@ Subject length lower bound = 2
 0: aaaA5
    aaaa5
 0: aaaa5
    a5 
 0: a5
 /(a*|b*)[cd]/I
 Capture group count = 1
@ -17401,6 +17403,38 @@ First code unit = 'a' (caseless)
 Last code unit = 'c' (caseless)
 Subject length lower bound = 3
 /[cc]abcd/I
 Capture group count = 0
 First code unit = 'c'
 Last code unit = 'd'
 Subject length lower bound = 5
 /[Cc]abcd/I
 Capture group count = 0
 First code unit = 'C' (caseless)
 Last code unit = 'd'
 Subject length lower bound = 5
 /[c]abcd/I
 Capture group count = 0
 First code unit = 'c'
 Last code unit = 'd'
 Subject length lower bound = 5
 /(?:c|C)abcd/I
 Capture group count = 0
 First code unit = 'C' (caseless)
 Last code unit = 'd'
 Subject length lower bound = 5
 /(a)?a/I
 Capture group count = 1
 Starting code units: a 
 Last code unit = 'a'
 Subject length lower bound = 1
    manm
 0: a
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data