Improve starting-byte bit map for UTF-8 patterns with wide characters in classes.
This commit is contained in:
Philip.Hazel 2019-09-10 15:38:42 +00:00
parent 78fae97f6c
commit d917899be5
4 changed files with 95 additions and 41 deletions

View File

@ -153,6 +153,13 @@ string pointer is close to 0.
same character, to be treated as a single caseless character. This causes the same character, to be treated as a single caseless character. This causes the
first and required code unit optimizations to kick in where relevant. first and required code unit optimizations to kick in where relevant.
34. Improve the bitmap of starting bytes for positive classes that include wide
characters, but no property types, in UTF-8 mode. Previously, on encountering
such a class, the bits for all bytes from \xc4 upwards were set, thus
specifying any character with codepoint >= 0x100. Now the only bits that are
set are for the relevant bytes that start the wide characters. This can give a
noticeable performance improvement.
Version 10.33 16-April-2019 Version 10.33 16-April-2019
--------------------------- ---------------------------

View File

@ -909,7 +909,7 @@ if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
/************************************************* /*************************************************
* Create bitmap of starting bytes * * Create bitmap of starting code units *
*************************************************/ *************************************************/
/* This function scans a compiled unanchored expression recursively and /* This function scans a compiled unanchored expression recursively and
@ -959,6 +959,9 @@ do
{ {
int rc; int rc;
uint8_t *classmap = NULL; uint8_t *classmap = NULL;
#ifdef SUPPORT_WIDE_CHARS
PCRE2_UCHAR xclassflags;
#endif
switch(*tcode) switch(*tcode)
{ {
@ -1467,20 +1470,59 @@ do
negative XCLASS without a map, give up. If there are no property checks, negative XCLASS without a map, give up. If there are no property checks,
there must be wide characters on the XCLASS list, because otherwise an there must be wide characters on the XCLASS list, because otherwise an
XCLASS would not have been created. This means that code points >= 255 XCLASS would not have been created. This means that code points >= 255
are always potential starters. */ are potential starters. In the UTF-8 case we can scan them and set bits
for the relevant leading bytes. */
#ifdef SUPPORT_WIDE_CHARS #ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS: case OP_XCLASS:
if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 || xclassflags = tcode[1 + LINK_SIZE];
(tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT) if ((xclassflags & XCL_HASPROP) != 0 ||
(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
return SSB_FAIL; return SSB_FAIL;
/* We have a positive XCLASS or a negative one with a map. Set up the /* We have a positive XCLASS or a negative one with a map. Set up the
map pointer if there is one, and fall through. */ map pointer if there is one, and fall through. */
classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL : classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
(uint8_t *)(tcode + 1 + LINK_SIZE + 1); (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
#endif
/* In UTF-8 mode, scan the character list and set bits for leading bytes,
then jump to handle the map. */
#if PCRE2_CODE_UNIT_WIDTH == 8
if (utf && (xclassflags & XCL_NOT) == 0)
{
PCRE2_UCHAR b, e;
PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
tcode += GET(tcode, 1);
for (;;) switch (*p++)
{
case XCL_SINGLE:
b = *p++;
while ((*p & 0xc0) == 0x80) p++;
re->start_bitmap[b/8] |= (1u << (b&7));
break;
case XCL_RANGE:
b = *p++;
while ((*p & 0xc0) == 0x80) p++;
e = *p++;
while ((*p & 0xc0) == 0x80) p++;
for (; b <= e; b++)
re->start_bitmap[b/8] |= (1u << (b&7));
break;
case XCL_END:
goto HANDLE_CLASSMAP;
default:
return SSB_UNKNOWN; /* Internal error, should not occur */
}
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
#endif /* SUPPORT_WIDE_CHARS */
/* It seems that the fall through comment must be outside the #ifdef if /* It seems that the fall through comment must be outside the #ifdef if
it is to avoid the gcc compiler warning. */ it is to avoid the gcc compiler warning. */
@ -1522,6 +1564,9 @@ do
greater than 127. In fact, there are only two possible starting bytes for greater than 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */ characters in the range 128 - 255. */
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
HANDLE_CLASSMAP:
#endif
if (classmap != NULL) if (classmap != NULL)
{ {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8

View File

@ -561,4 +561,10 @@
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
/[󿾟,]/BI,utf
/[\x{fff4}-\x{ffff8}]/I,utf
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
# End of testinput10 # End of testinput10

66
testdata/testoutput10 vendored
View File

@ -1256,11 +1256,7 @@ Subject length lower bound = 1
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 0 Capture group count = 0
Options: utf Options: utf
Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd Starting code units: Z \xc4
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
\xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1 Subject length lower bound = 1
Z\x{100} Z\x{100}
0: Z 0: Z
@ -1278,11 +1274,7 @@ Subject length lower bound = 1
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 0 Capture group count = 0
Options: utf Options: utf
Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4
\xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8
\xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7
\xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6
\xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1 Subject length lower bound = 1
/[z\Qa-d]Ā\E]/IB,utf /[z\Qa-d]Ā\E]/IB,utf
@ -1294,11 +1286,7 @@ Subject length lower bound = 1
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 0 Capture group count = 0
Options: utf Options: utf
Starting code units: - ] a d z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc Starting code units: - ] a d z \xc4
\xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb
\xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea
\xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9
\xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1 Subject length lower bound = 1
\x{100} \x{100}
0: \x{100} 0: \x{100}
@ -1319,11 +1307,7 @@ Subject length lower bound = 1
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 1 Capture group count = 1
Options: utf Options: utf
Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd Starting code units: a b \xc4
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
\xfb \xfc \xfd \xfe \xff
Last code unit = 'z' Last code unit = 'z'
Subject length lower bound = 7 Subject length lower bound = 7
@ -1440,11 +1424,7 @@ Subject length lower bound = 1
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 0 Capture group count = 0
Options: caseless utf Options: caseless utf
Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce Starting code units: \xc4
\xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd
\xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec
\xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
\xfc \xfd \xfe \xff
Subject length lower bound = 1 Subject length lower bound = 1
\x{104} \x{104}
0: \x{104} 0: \x{104}
@ -1467,11 +1447,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 0 Capture group count = 0
Options: caseless utf Options: caseless utf
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
\xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
\xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1 Subject length lower bound = 1
Z Z
0: Z 0: Z
@ -1508,11 +1484,7 @@ No match
------------------------------------------------------------------ ------------------------------------------------------------------
Capture group count = 0 Capture group count = 0
Options: caseless utf Options: caseless utf
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
\xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
\xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1 Subject length lower bound = 1
/\x{3a3}B/IBi,utf /\x{3a3}B/IBi,utf
@ -1773,4 +1745,28 @@ Starting code units: \xc3
Last code unit = 'X' Last code unit = 'X'
Subject length lower bound = 3 Subject length lower bound = 3
/[󿾟,]/BI,utf
------------------------------------------------------------------
Bra
[,\x{fff9f}]
Ket
End
------------------------------------------------------------------
Capture group count = 0
Options: utf
Starting code units: , \xf3
Subject length lower bound = 1
/[\x{fff4}-\x{ffff8}]/I,utf
Capture group count = 0
Options: utf
Starting code units: \xef \xf0 \xf1 \xf2 \xf3
Subject length lower bound = 1
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
Capture group count = 0
Options: utf
Starting code units: \xef \xf0 \xf1 \xf2 \xf4
Subject length lower bound = 1
# End of testinput10 # End of testinput10