Improve starting-byte bit map for UTF-8 patterns with wide characters in
classes.
This commit is contained in:
parent
78fae97f6c
commit
d917899be5
|
@ -153,6 +153,13 @@ string pointer is close to 0.
|
|||
same character, to be treated as a single caseless character. This causes the
|
||||
first and required code unit optimizations to kick in where relevant.
|
||||
|
||||
34. Improve the bitmap of starting bytes for positive classes that include wide
|
||||
characters, but no property types, in UTF-8 mode. Previously, on encountering
|
||||
such a class, the bits for all bytes from \xc4 upwards were set, thus
|
||||
specifying any character with codepoint >= 0x100. Now the only bits that are
|
||||
set are for the relevant bytes that start the wide characters. This can give a
|
||||
noticeable performance improvement.
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
---------------------------
|
||||
|
|
|
@ -909,7 +909,7 @@ if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
|
|||
|
||||
|
||||
/*************************************************
|
||||
* Create bitmap of starting bytes *
|
||||
* Create bitmap of starting code units *
|
||||
*************************************************/
|
||||
|
||||
/* This function scans a compiled unanchored expression recursively and
|
||||
|
@ -959,6 +959,9 @@ do
|
|||
{
|
||||
int rc;
|
||||
uint8_t *classmap = NULL;
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
PCRE2_UCHAR xclassflags;
|
||||
#endif
|
||||
|
||||
switch(*tcode)
|
||||
{
|
||||
|
@ -1467,20 +1470,59 @@ do
|
|||
negative XCLASS without a map, give up. If there are no property checks,
|
||||
there must be wide characters on the XCLASS list, because otherwise an
|
||||
XCLASS would not have been created. This means that code points > 255
|
||||
are always potential starters. */
|
||||
are potential starters. In the UTF-8 case we can scan them and set bits
|
||||
for the relevant leading bytes. */
|
||||
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
case OP_XCLASS:
|
||||
if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
|
||||
(tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
|
||||
xclassflags = tcode[1 + LINK_SIZE];
|
||||
if ((xclassflags & XCL_HASPROP) != 0 ||
|
||||
(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
|
||||
return SSB_FAIL;
|
||||
|
||||
/* We have a positive XCLASS or a negative one with a map. Set up the
|
||||
map pointer if there is one, and fall through. */
|
||||
|
||||
classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
|
||||
classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
|
||||
(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
|
||||
#endif
|
||||
|
||||
/* In UTF-8 mode, scan the character list and set bits for leading bytes,
|
||||
then jump to handle the map. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if (utf && (xclassflags & XCL_NOT) == 0)
|
||||
{
|
||||
PCRE2_UCHAR b, e;
|
||||
PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
|
||||
tcode += GET(tcode, 1);
|
||||
|
||||
for (;;) switch (*p++)
|
||||
{
|
||||
case XCL_SINGLE:
|
||||
b = *p++;
|
||||
while ((*p & 0xc0) == 0x80) p++;
|
||||
re->start_bitmap[b/8] |= (1u << (b&7));
|
||||
break;
|
||||
|
||||
case XCL_RANGE:
|
||||
b = *p++;
|
||||
while ((*p & 0xc0) == 0x80) p++;
|
||||
e = *p++;
|
||||
while ((*p & 0xc0) == 0x80) p++;
|
||||
for (; b <= e; b++)
|
||||
re->start_bitmap[b/8] |= (1u << (b&7));
|
||||
break;
|
||||
|
||||
case XCL_END:
|
||||
goto HANDLE_CLASSMAP;
|
||||
|
||||
default:
|
||||
return SSB_UNKNOWN; /* Internal error, should not occur */
|
||||
}
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||
#endif /* SUPPORT_WIDE_CHARS */
|
||||
|
||||
/* It seems that the fall through comment must be outside the #ifdef if
|
||||
it is to avoid the gcc compiler warning. */
|
||||
|
||||
|
@ -1522,6 +1564,9 @@ do
|
|||
greater than 127. In fact, there are only two possible starting bytes for
|
||||
characters in the range 128 - 255. */
|
||||
|
||||
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
HANDLE_CLASSMAP:
|
||||
#endif
|
||||
if (classmap != NULL)
|
||||
{
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
|
|
|
@ -561,4 +561,10 @@
|
|||
|
||||
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||
|
||||
/[,]/BI,utf
|
||||
|
||||
/[\x{fff4}-\x{ffff8}]/I,utf
|
||||
|
||||
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -1256,11 +1256,7 @@ Subject length lower bound = 1
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: utf
|
||||
Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
|
||||
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
|
||||
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
|
||||
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
|
||||
\xfb \xfc \xfd \xfe \xff
|
||||
Starting code units: Z \xc4
|
||||
Subject length lower bound = 1
|
||||
Z\x{100}
|
||||
0: Z
|
||||
|
@ -1278,11 +1274,7 @@ Subject length lower bound = 1
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: utf
|
||||
Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9
|
||||
\xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8
|
||||
\xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7
|
||||
\xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6
|
||||
\xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
|
||||
Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4
|
||||
Subject length lower bound = 1
|
||||
|
||||
/[z\Qa-d]Ā\E]/IB,utf
|
||||
|
@ -1294,11 +1286,7 @@ Subject length lower bound = 1
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: utf
|
||||
Starting code units: - ] a d z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc
|
||||
\xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb
|
||||
\xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea
|
||||
\xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9
|
||||
\xfa \xfb \xfc \xfd \xfe \xff
|
||||
Starting code units: - ] a d z \xc4
|
||||
Subject length lower bound = 1
|
||||
\x{100}
|
||||
0: \x{100}
|
||||
|
@ -1319,11 +1307,7 @@ Subject length lower bound = 1
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 1
|
||||
Options: utf
|
||||
Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
|
||||
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
|
||||
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
|
||||
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
|
||||
\xfb \xfc \xfd \xfe \xff
|
||||
Starting code units: a b \xc4
|
||||
Last code unit = 'z'
|
||||
Subject length lower bound = 7
|
||||
|
||||
|
@ -1440,11 +1424,7 @@ Subject length lower bound = 1
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: caseless utf
|
||||
Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce
|
||||
\xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd
|
||||
\xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec
|
||||
\xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
|
||||
\xfc \xfd \xfe \xff
|
||||
Starting code units: \xc4
|
||||
Subject length lower bound = 1
|
||||
\x{104}
|
||||
0: \x{104}
|
||||
|
@ -1467,11 +1447,7 @@ No match
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: caseless utf
|
||||
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8
|
||||
\xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
|
||||
\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
|
||||
\xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
|
||||
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
|
||||
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
|
||||
Subject length lower bound = 1
|
||||
Z
|
||||
0: Z
|
||||
|
@ -1508,11 +1484,7 @@ No match
|
|||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: caseless utf
|
||||
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8
|
||||
\xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
|
||||
\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
|
||||
\xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
|
||||
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
|
||||
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
|
||||
Subject length lower bound = 1
|
||||
|
||||
/\x{3a3}B/IBi,utf
|
||||
|
@ -1773,4 +1745,28 @@ Starting code units: \xc3
|
|||
Last code unit = 'X'
|
||||
Subject length lower bound = 3
|
||||
|
||||
/[,]/BI,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
[,\x{fff9f}]
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
Capture group count = 0
|
||||
Options: utf
|
||||
Starting code units: , \xf3
|
||||
Subject length lower bound = 1
|
||||
|
||||
/[\x{fff4}-\x{ffff8}]/I,utf
|
||||
Capture group count = 0
|
||||
Options: utf
|
||||
Starting code units: \xef \xf0 \xf1 \xf2 \xf3
|
||||
Subject length lower bound = 1
|
||||
|
||||
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
|
||||
Capture group count = 0
|
||||
Options: utf
|
||||
Starting code units: \xef \xf0 \xf1 \xf2 \xf4
|
||||
Subject length lower bound = 1
|
||||
|
||||
# End of testinput10
|
||||
|
|
Loading…
Reference in New Issue