Improve starting-byte bit map for UTF-8 patterns with wide characters in
classes.
This commit is contained in:
parent
78fae97f6c
commit
d917899be5
|
@ -153,6 +153,13 @@ string pointer is close to 0.
|
||||||
same character, to be treated as a single caseless character. This causes the
|
same character, to be treated as a single caseless character. This causes the
|
||||||
first and required code unit optimizations to kick in where relevant.
|
first and required code unit optimizations to kick in where relevant.
|
||||||
|
|
||||||
|
34. Improve the bitmap of starting bytes for positive classes that include wide
|
||||||
|
characters, but no property types, in UTF-8 mode. Previously, on encountering
|
||||||
|
such a class, the bits for all bytes from \xc4 upwards were set, thus
|
||||||
|
specifying any character with codepoint >= 0x100. Now the only bits that are
|
||||||
|
set are for the relevant bytes that start the wide characters. This can give a
|
||||||
|
noticeable performance improvement.
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@ -909,7 +909,7 @@ if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Create bitmap of starting bytes *
|
* Create bitmap of starting code units *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This function scans a compiled unanchored expression recursively and
|
/* This function scans a compiled unanchored expression recursively and
|
||||||
|
@ -959,6 +959,9 @@ do
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
uint8_t *classmap = NULL;
|
uint8_t *classmap = NULL;
|
||||||
|
#ifdef SUPPORT_WIDE_CHARS
|
||||||
|
PCRE2_UCHAR xclassflags;
|
||||||
|
#endif
|
||||||
|
|
||||||
switch(*tcode)
|
switch(*tcode)
|
||||||
{
|
{
|
||||||
|
@ -1467,20 +1470,59 @@ do
|
||||||
negative XCLASS without a map, give up. If there are no property checks,
|
negative XCLASS without a map, give up. If there are no property checks,
|
||||||
there must be wide characters on the XCLASS list, because otherwise an
|
there must be wide characters on the XCLASS list, because otherwise an
|
||||||
XCLASS would not have been created. This means that code points >= 256
|
XCLASS would not have been created. This means that code points >= 256
|
||||||
are always potential starters. */
|
are potential starters. In the UTF-8 case we can scan them and set bits
|
||||||
|
for the relevant leading bytes. */
|
||||||
|
|
||||||
#ifdef SUPPORT_WIDE_CHARS
|
#ifdef SUPPORT_WIDE_CHARS
|
||||||
case OP_XCLASS:
|
case OP_XCLASS:
|
||||||
if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
|
xclassflags = tcode[1 + LINK_SIZE];
|
||||||
(tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
|
if ((xclassflags & XCL_HASPROP) != 0 ||
|
||||||
|
(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
|
||||||
return SSB_FAIL;
|
return SSB_FAIL;
|
||||||
|
|
||||||
/* We have a positive XCLASS or a negative one with a map. Set up the
|
/* We have a positive XCLASS or a negative one with a map. Set up the
|
||||||
map pointer if there is one, and fall through. */
|
map pointer if there is one, and fall through. */
|
||||||
|
|
||||||
classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
|
classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
|
||||||
(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
|
(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
|
||||||
#endif
|
|
||||||
|
/* In UTF-8 mode, scan the character list and set bits for leading bytes,
|
||||||
|
then jump to handle the map. */
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (utf && (xclassflags & XCL_NOT) == 0)
|
||||||
|
{
|
||||||
|
PCRE2_UCHAR b, e;
|
||||||
|
PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
|
||||||
|
tcode += GET(tcode, 1);
|
||||||
|
|
||||||
|
for (;;) switch (*p++)
|
||||||
|
{
|
||||||
|
case XCL_SINGLE:
|
||||||
|
b = *p++;
|
||||||
|
while ((*p & 0xc0) == 0x80) p++;
|
||||||
|
re->start_bitmap[b/8] |= (1u << (b&7));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case XCL_RANGE:
|
||||||
|
b = *p++;
|
||||||
|
while ((*p & 0xc0) == 0x80) p++;
|
||||||
|
e = *p++;
|
||||||
|
while ((*p & 0xc0) == 0x80) p++;
|
||||||
|
for (; b <= e; b++)
|
||||||
|
re->start_bitmap[b/8] |= (1u << (b&7));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case XCL_END:
|
||||||
|
goto HANDLE_CLASSMAP;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return SSB_UNKNOWN; /* Internal error, should not occur */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||||
|
#endif /* SUPPORT_WIDE_CHARS */
|
||||||
|
|
||||||
/* It seems that the fall through comment must be outside the #ifdef if
|
/* It seems that the fall through comment must be outside the #ifdef if
|
||||||
it is to avoid the gcc compiler warning. */
|
it is to avoid the gcc compiler warning. */
|
||||||
|
|
||||||
|
@ -1522,6 +1564,9 @@ do
|
||||||
greater than 127. In fact, there are only two possible starting bytes for
|
greater than 127. In fact, there are only two possible starting bytes for
|
||||||
characters in the range 128 - 255. */
|
characters in the range 128 - 255. */
|
||||||
|
|
||||||
|
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
HANDLE_CLASSMAP:
|
||||||
|
#endif
|
||||||
if (classmap != NULL)
|
if (classmap != NULL)
|
||||||
{
|
{
|
||||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
|
|
@ -561,4 +561,10 @@
|
||||||
|
|
||||||
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
|
||||||
|
/[,]/BI,utf
|
||||||
|
|
||||||
|
/[\x{fff4}-\x{ffff8}]/I,utf
|
||||||
|
|
||||||
|
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -1256,11 +1256,7 @@ Subject length lower bound = 1
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Options: utf
|
Options: utf
|
||||||
Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
|
Starting code units: Z \xc4
|
||||||
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
|
|
||||||
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
|
|
||||||
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
|
|
||||||
\xfb \xfc \xfd \xfe \xff
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
Z\x{100}
|
Z\x{100}
|
||||||
0: Z
|
0: Z
|
||||||
|
@ -1278,11 +1274,7 @@ Subject length lower bound = 1
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Options: utf
|
Options: utf
|
||||||
Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9
|
Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4
|
||||||
\xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8
|
|
||||||
\xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7
|
|
||||||
\xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6
|
|
||||||
\xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
/[z\Qa-d]Ā\E]/IB,utf
|
/[z\Qa-d]Ā\E]/IB,utf
|
||||||
|
@ -1294,11 +1286,7 @@ Subject length lower bound = 1
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Options: utf
|
Options: utf
|
||||||
Starting code units: - ] a d z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc
|
Starting code units: - ] a d z \xc4
|
||||||
\xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb
|
|
||||||
\xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea
|
|
||||||
\xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9
|
|
||||||
\xfa \xfb \xfc \xfd \xfe \xff
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
\x{100}
|
\x{100}
|
||||||
0: \x{100}
|
0: \x{100}
|
||||||
|
@ -1319,11 +1307,7 @@ Subject length lower bound = 1
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 1
|
Capture group count = 1
|
||||||
Options: utf
|
Options: utf
|
||||||
Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
|
Starting code units: a b \xc4
|
||||||
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
|
|
||||||
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
|
|
||||||
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
|
|
||||||
\xfb \xfc \xfd \xfe \xff
|
|
||||||
Last code unit = 'z'
|
Last code unit = 'z'
|
||||||
Subject length lower bound = 7
|
Subject length lower bound = 7
|
||||||
|
|
||||||
|
@ -1440,11 +1424,7 @@ Subject length lower bound = 1
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Options: caseless utf
|
Options: caseless utf
|
||||||
Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce
|
Starting code units: \xc4
|
||||||
\xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd
|
|
||||||
\xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec
|
|
||||||
\xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
|
|
||||||
\xfc \xfd \xfe \xff
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
\x{104}
|
\x{104}
|
||||||
0: \x{104}
|
0: \x{104}
|
||||||
|
@ -1467,11 +1447,7 @@ No match
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Options: caseless utf
|
Options: caseless utf
|
||||||
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8
|
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
|
||||||
\xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
|
|
||||||
\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
|
|
||||||
\xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
|
|
||||||
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
Z
|
Z
|
||||||
0: Z
|
0: Z
|
||||||
|
@ -1508,11 +1484,7 @@ No match
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Capture group count = 0
|
Capture group count = 0
|
||||||
Options: caseless utf
|
Options: caseless utf
|
||||||
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8
|
Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
|
||||||
\xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
|
|
||||||
\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
|
|
||||||
\xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
|
|
||||||
\xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
|
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
/\x{3a3}B/IBi,utf
|
/\x{3a3}B/IBi,utf
|
||||||
|
@ -1773,4 +1745,28 @@ Starting code units: \xc3
|
||||||
Last code unit = 'X'
|
Last code unit = 'X'
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
|
|
||||||
|
/[,]/BI,utf
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[,\x{fff9f}]
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: , \xf3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{fff4}-\x{ffff8}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xef \xf0 \xf1 \xf2 \xf3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xef \xf0 \xf1 \xf2 \xf4
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
Loading…
Reference in New Issue