diff --git a/ChangeLog b/ChangeLog index dd348b9..d234409 100644 --- a/ChangeLog +++ b/ChangeLog @@ -153,6 +153,13 @@ string pointer is close to 0. same character, to be treated as a single caseless character. This causes the first and required code unit optimizations to kick in where relevant. +34. Improve the bitmap of starting bytes for positive classes that include wide +characters, but no property types, in UTF-8 mode. Previously, on encountering +such a class, the bits for all bytes from \xc4 upwards were set, thus +specifying any character with codepoint >= 0x100. Now the only bits that are +set are for the relevant bytes that start the wide characters. This can give a +noticeable performance improvement. + Version 10.33 16-April-2019 --------------------------- diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 6370e17..23deebb 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -909,7 +909,7 @@ if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff; /************************************************* -* Create bitmap of starting bytes * +* Create bitmap of starting code units * *************************************************/ /* This function scans a compiled unanchored expression recursively and @@ -959,6 +959,9 @@ do { int rc; uint8_t *classmap = NULL; +#ifdef SUPPORT_WIDE_CHARS + PCRE2_UCHAR xclassflags; +#endif switch(*tcode) { @@ -1467,20 +1470,59 @@ do negative XCLASS without a map, give up. If there are no property checks, there must be wide characters on the XCLASS list, because otherwise an XCLASS would not have been created. This means that code points >= 255 - are always potential starters. */ + are potential starters. In the UTF-8 case we can scan them and set bits + for the relevant leading bytes. 
*/ #ifdef SUPPORT_WIDE_CHARS case OP_XCLASS: - if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 || - (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT) + xclassflags = tcode[1 + LINK_SIZE]; + if ((xclassflags & XCL_HASPROP) != 0 || + (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT) return SSB_FAIL; /* We have a positive XCLASS or a negative one without a map. Set up the map pointer if there is one, and fall through. */ - classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL : + classmap = ((xclassflags & XCL_MAP) == 0)? NULL : (uint8_t *)(tcode + 1 + LINK_SIZE + 1); -#endif + + /* In UTF-8 mode, scan the character list and set bits for leading bytes, + then jump to handle the map. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf && (xclassflags & XCL_NOT) == 0) + { + PCRE2_UCHAR b, e; + PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32); + tcode += GET(tcode, 1); + + for (;;) switch (*p++) + { + case XCL_SINGLE: + b = *p++; + while ((*p & 0xc0) == 0x80) p++; + re->start_bitmap[b/8] |= (1u << (b&7)); + break; + + case XCL_RANGE: + b = *p++; + while ((*p & 0xc0) == 0x80) p++; + e = *p++; + while ((*p & 0xc0) == 0x80) p++; + for (; b <= e; b++) + re->start_bitmap[b/8] |= (1u << (b&7)); + break; + + case XCL_END: + goto HANDLE_CLASSMAP; + + default: + return SSB_UNKNOWN; /* Internal error, should not occur */ + } + } +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ +#endif /* SUPPORT_WIDE_CHARS */ + /* It seems that the fall through comment must be outside the #ifdef if it is to avoid the gcc compiler warning. */ @@ -1522,6 +1564,9 @@ do greater than 127. In fact, there are only two possible starting bytes for characters in the range 128 - 255. 
*/ +#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8 + HANDLE_CLASSMAP: +#endif if (classmap != NULL) { #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 diff --git a/testdata/testinput10 b/testdata/testinput10 index 4353119..cf92525 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -561,4 +561,10 @@ /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf +/[󿾟,]/BI,utf + +/[\x{fff4}-\x{ffff8}]/I,utf + +/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf + # End of testinput10 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index e5c7d3c..1fe44fb 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1256,11 +1256,7 @@ Subject length lower bound = 1 ------------------------------------------------------------------ Capture group count = 0 Options: utf -Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd - \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc - \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb - \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa - \xfb \xfc \xfd \xfe \xff +Starting code units: Z \xc4 Subject length lower bound = 1 Z\x{100} 0: Z @@ -1278,11 +1274,7 @@ Subject length lower bound = 1 ------------------------------------------------------------------ Capture group count = 0 Options: utf -Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 - \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 - \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 - \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 - \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff +Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 Subject length lower bound = 1 /[z\Qa-d]Ā\E]/IB,utf @@ -1294,11 +1286,7 @@ Subject length lower bound = 1 ------------------------------------------------------------------ Capture group count = 0 Options: utf -Starting code units: - ] a d z 
\xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc - \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb - \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea - \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 - \xfa \xfb \xfc \xfd \xfe \xff +Starting code units: - ] a d z \xc4 Subject length lower bound = 1 \x{100} 0: \x{100} @@ -1319,11 +1307,7 @@ Subject length lower bound = 1 ------------------------------------------------------------------ Capture group count = 1 Options: utf -Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd - \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc - \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb - \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa - \xfb \xfc \xfd \xfe \xff +Starting code units: a b \xc4 Last code unit = 'z' Subject length lower bound = 7 @@ -1440,11 +1424,7 @@ Subject length lower bound = 1 ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf -Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce - \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd - \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec - \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb - \xfc \xfd \xfe \xff +Starting code units: \xc4 Subject length lower bound = 1 \x{104} 0: \x{104} @@ -1467,11 +1447,7 @@ No match ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf -Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 - \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 - \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 - \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 - 
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff +Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 Subject length lower bound = 1 Z 0: Z @@ -1508,11 +1484,7 @@ No match ------------------------------------------------------------------ Capture group count = 0 Options: caseless utf -Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 - \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 - \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 - \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 - \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff +Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 Subject length lower bound = 1 /\x{3a3}B/IBi,utf @@ -1773,4 +1745,28 @@ Starting code units: \xc3 Last code unit = 'X' Subject length lower bound = 3 +/[󿾟,]/BI,utf +------------------------------------------------------------------ + Bra + [,\x{fff9f}] + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +Options: utf +Starting code units: , \xf3 +Subject length lower bound = 1 + +/[\x{fff4}-\x{ffff8}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xef \xf0 \xf1 \xf2 \xf3 +Subject length lower bound = 1 + +/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf +Capture group count = 0 +Options: utf +Starting code units: \xef \xf0 \xf1 \xf2 \xf4 +Subject length lower bound = 1 + # End of testinput10