Improve starting-byte bit map for UTF-8 patterns with wide characters in

classes.
2019-09-10 15:38:42 +00:00 · 2019-09-10 15:38:42 +00:00 · d917899be5
parent 78fae97f6c
commit d917899be5
4 changed files with 95 additions and 41 deletions
--- a/7
+++ b/7
@ -153,6 +153,13 @@ string pointer is close to 0.
 same character, to be treated as a single caseless character. This causes the 
 first and required code unit optimizations to kick in where relevant.
 34. Improve the bitmap of starting bytes for positive classes that include wide
 characters, but no property types, in UTF-8 mode. Previously, on encountering
 such a class, the bits for all bytes greater than \xc4 were set, thus
 specifying any character with codepoint >= 0x100. Now the only bits that are
 set are for the relevant bytes that start the wide characters. This can give a 
 noticeable performance improvement.
 Version 10.33 16-April-2019
 ---------------------------
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -909,7 +909,7 @@ if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
 /*************************************************
-*          Create bitmap of starting bytes       *
+*      Create bitmap of starting code units      *
 *************************************************/
 /* This function scans a compiled unanchored expression recursively and
@ -959,6 +959,9 @@ do
    {
    int rc;
    uint8_t *classmap = NULL;
 #ifdef SUPPORT_WIDE_CHARS
    PCRE2_UCHAR xclassflags;
 #endif
    switch(*tcode)
      {
@ -1467,20 +1470,59 @@ do
      negative XCLASS without a map, give up. If there are no property checks,
      there must be wide characters on the XCLASS list, because otherwise an
      XCLASS would not have been created. This means that code points >= 255
-      are always potential starters. */
+      are potential starters. In the UTF-8 case we can scan them and set bits
      for the relevant leading bytes. */
 #ifdef SUPPORT_WIDE_CHARS
      case OP_XCLASS:
-      if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
+      xclassflags = tcode[1 + LINK_SIZE];
-          (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
+      if ((xclassflags & XCL_HASPROP) != 0 ||
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
        return SSB_FAIL;
      /* We have a positive XCLASS or a negative one without a map. Set up the
      map pointer if there is one, and fall through. */
-      classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
+      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
        (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
-#endif
+
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
      then jump to handle the map. */
 #if PCRE2_CODE_UNIT_WIDTH == 8
      if (utf && (xclassflags & XCL_NOT) == 0)
        {
        PCRE2_UCHAR b, e;
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
        tcode += GET(tcode, 1);
        for (;;) switch (*p++)
          {
          case XCL_SINGLE:
          b = *p++;
          while ((*p & 0xc0) == 0x80) p++;
          re->start_bitmap[b/8] |= (1u << (b&7));
          break;
          case XCL_RANGE:
          b = *p++;
          while ((*p & 0xc0) == 0x80) p++;
          e = *p++;
          while ((*p & 0xc0) == 0x80) p++;
          for (; b <= e; b++)
            re->start_bitmap[b/8] |= (1u << (b&7));
          break;
          case XCL_END:
          goto HANDLE_CLASSMAP;
          default:
          return SSB_UNKNOWN;   /* Internal error, should not occur */
          }
        }
 #endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
 #endif  /* SUPPORT_WIDE_CHARS */
      /* It seems that the fall through comment must be outside the #ifdef if
      it is to avoid the gcc compiler warning. */
@ -1522,6 +1564,9 @@ do
      greater than 127. In fact, there are only two possible starting bytes for
      characters in the range 128 - 255. */
 #if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
      HANDLE_CLASSMAP:
 #endif
      if (classmap != NULL)
        {
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -561,4 +561,10 @@
 /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
 /[󿾟,]/BI,utf
 /[\x{fff4}-\x{ffff8}]/I,utf
 /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
 # End of testinput10
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1256,11 +1256,7 @@ Subject length lower bound = 1
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
-Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd 
+Starting code units: Z \xc4 
  \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc 
  \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb 
  \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa 
  \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1
    Z\x{100}
 0: Z
@ -1278,11 +1274,7 @@ Subject length lower bound = 1
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
-Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 
+Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 
  \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 
  \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 
  \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 
  \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1
 /[z\Qa-d]Ā\E]/IB,utf
@ -1294,11 +1286,7 @@ Subject length lower bound = 1
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
-Starting code units: - ] a d z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc 
+Starting code units: - ] a d z \xc4 
  \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb 
  \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea 
  \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 
  \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1
    \x{100}
 0: \x{100}
@ -1319,11 +1307,7 @@ Subject length lower bound = 1
 ------------------------------------------------------------------
 Capture group count = 1
 Options: utf
-Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd 
+Starting code units: a b \xc4 
  \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc 
  \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb 
  \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa 
  \xfb \xfc \xfd \xfe \xff 
 Last code unit = 'z'
 Subject length lower bound = 7
@ -1440,11 +1424,7 @@ Subject length lower bound = 1
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless utf
-Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce 
+Starting code units: \xc4 
  \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd 
  \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec 
  \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb 
  \xfc \xfd \xfe \xff 
 Subject length lower bound = 1
    \x{104}
 0: \x{104}
@ -1467,11 +1447,7 @@ No match
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless utf
-Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 
+Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 
  \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 
  \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 
  \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 
  \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1
    Z
 0: Z
@ -1508,11 +1484,7 @@ No match
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless utf
-Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 
+Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 
  \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 
  \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 
  \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 
  \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1
 /\x{3a3}B/IBi,utf
@ -1773,4 +1745,28 @@ Starting code units: \xc3
 Last code unit = 'X'
 Subject length lower bound = 3
 /[󿾟,]/BI,utf
 ------------------------------------------------------------------
        Bra
        [,\x{fff9f}]
        Ket
        End
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
 Starting code units: , \xf3 
 Subject length lower bound = 1
 /[\x{fff4}-\x{ffff8}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xef \xf0 \xf1 \xf2 \xf3 
 Subject length lower bound = 1
 /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
 Capture group count = 0
 Options: utf
 Starting code units: \xef \xf0 \xf1 \xf2 \xf4 
 Subject length lower bound = 1
 # End of testinput10