Optimize certain starting code unit bit maps into a single starting code unit.
This commit is contained in:
parent
d917899be5
commit
e413f3147c
10
ChangeLog
10
ChangeLog
|
@ -160,6 +160,16 @@ specifying any character with codepoint >= 0x100. Now the only bits that are
|
||||||
set are for the relevant bytes that start the wide characters. This can give a
|
set are for the relevant bytes that start the wide characters. This can give a
|
||||||
noticeable performance improvement.
|
noticeable performance improvement.
|
||||||
|
|
||||||
|
35. If the bitmap of starting code units contains only 1 or 2 bits, replace it
|
||||||
|
with a single starting code unit (1 bit) or a caseless single starting code
|
||||||
|
unit if the two relevant characters are case-partners. This is particularly
|
||||||
|
relevant to the 8-bit library, though it applies to all. It can give a
|
||||||
|
performance boost for patterns such as [Ww]ord and (word|WORD). However, this
|
||||||
|
optimization doesn't happen if there is a "required" code unit of the same
|
||||||
|
value (because the search for a "required" code unit starts at the match start
|
||||||
|
for non-unique first code unit patterns, but after a unique first code unit,
|
||||||
|
and patterns such as a*a need the former action).
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@ -1666,7 +1666,101 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
|
||||||
{
|
{
|
||||||
int rc = set_start_bits(re, code, utf);
|
int rc = set_start_bits(re, code, utf);
|
||||||
if (rc == SSB_UNKNOWN) return 1;
|
if (rc == SSB_UNKNOWN) return 1;
|
||||||
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
|
|
||||||
|
/* If a list of starting code units was set up, scan the list to see if only
|
||||||
|
one or two were listed. Having only one listed is rare because usually a
|
||||||
|
single starting code unit will have been recognized and PCRE2_FIRSTSET set.
|
||||||
|
If two are listed, see if they are caseless versions of the same character;
|
||||||
|
if so we can replace the list with a caseless first code unit. This gives
|
||||||
|
better performance and is plausibly worth doing for patterns such as [Ww]ord
|
||||||
|
or (word|WORD). */
|
||||||
|
|
||||||
|
if (rc == SSB_DONE)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
int a = -1;
|
||||||
|
int b = -1;
|
||||||
|
uint8_t *p = re->start_bitmap;
|
||||||
|
uint32_t flags = PCRE2_FIRSTMAPSET;
|
||||||
|
|
||||||
|
for (i = 0; i < 256; p++, i += 8)
|
||||||
|
{
|
||||||
|
uint8_t x = *p;
|
||||||
|
if (x != 0)
|
||||||
|
{
|
||||||
|
int c;
|
||||||
|
uint8_t y = x & (~x + 1); /* Least significant bit */
|
||||||
|
if (y != x) goto DONE; /* More than one bit set */
|
||||||
|
|
||||||
|
/* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
|
||||||
|
all wide characters", so we cannot use it here. */
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
|
if (i == 248 && x == 0x80) goto DONE;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Compute the character value */
|
||||||
|
|
||||||
|
c = i;
|
||||||
|
switch (x)
|
||||||
|
{
|
||||||
|
case 1: break;
|
||||||
|
case 2: c += 1; break; case 4: c += 2; break;
|
||||||
|
case 8: c += 3; break; case 16: c += 4; break;
|
||||||
|
case 32: c += 5; break; case 64: c += 6; break;
|
||||||
|
case 128: c += 7; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
|
||||||
|
mode, only values < 128 can be used. */
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (c > 127) goto DONE;
|
||||||
|
#endif
|
||||||
|
if (a < 0) a = c; /* First one found */
|
||||||
|
else if (b < 0) /* Second one found */
|
||||||
|
{
|
||||||
|
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
|
||||||
|
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
|
||||||
|
#else /* 16-bit or 32-bit */
|
||||||
|
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
|
||||||
|
if (utf && c > 127) d = UCD_OTHERCASE(c);
|
||||||
|
#endif /* Code width */
|
||||||
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
|
if (d != a) goto DONE; /* Not other case of a */
|
||||||
|
b = c;
|
||||||
|
}
|
||||||
|
else goto DONE; /* More than two characters found */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Replace the start code unit bits with a first code unit, but only if it
|
||||||
|
is not the same as a required later code unit. This is because a search for
|
||||||
|
a required code unit starts after an explicit first code unit, but at a
|
||||||
|
code unit found from the bitmap. Patterns such as /a*a/ don't work
|
||||||
|
if both the start unit and required unit are the same. */
|
||||||
|
|
||||||
|
if (a >= 0 &&
|
||||||
|
(
|
||||||
|
(re->flags & PCRE2_LASTSET) == 0 ||
|
||||||
|
(
|
||||||
|
re->last_codeunit != (uint32_t)a &&
|
||||||
|
(b < 0 || re->last_codeunit != (uint32_t)b)
|
||||||
|
)
|
||||||
|
))
|
||||||
|
{
|
||||||
|
re->first_codeunit = a;
|
||||||
|
flags = PCRE2_FIRSTSET;
|
||||||
|
if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
DONE:
|
||||||
|
re->flags |= flags;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Find the minimum length of subject string. If the pattern can match an empty
|
/* Find the minimum length of subject string. If the pattern can match an empty
|
||||||
|
|
|
@ -567,4 +567,16 @@
|
||||||
|
|
||||||
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
|
/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
|
||||||
|
|
||||||
|
/[\xff\x{ffff}]/I,utf
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I,utf
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I
|
||||||
|
|
||||||
|
/[Ss]/I
|
||||||
|
|
||||||
|
/[Ss]/I,utf
|
||||||
|
|
||||||
|
/(?:\x{ff}|\x{3000})/I,utf
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -451,4 +451,16 @@
|
||||||
|
|
||||||
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
|
||||||
|
/[\xff\x{ffff}]/I,utf
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I,utf
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I
|
||||||
|
|
||||||
|
/[Ss]/I
|
||||||
|
|
||||||
|
/[Ss]/I,utf
|
||||||
|
|
||||||
|
/(?:\x{ff}|\x{3000})/I,utf
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1625,6 +1625,7 @@
|
||||||
/^a*A\d/IBi
|
/^a*A\d/IBi
|
||||||
aaaA5
|
aaaA5
|
||||||
aaaa5
|
aaaa5
|
||||||
|
a5
|
||||||
|
|
||||||
/(a*|b*)[cd]/I
|
/(a*|b*)[cd]/I
|
||||||
|
|
||||||
|
@ -5760,4 +5761,15 @@ a)"xI
|
||||||
|
|
||||||
/[aA]b[cC]/IB
|
/[aA]b[cC]/IB
|
||||||
|
|
||||||
|
/[cc]abcd/I
|
||||||
|
|
||||||
|
/[Cc]abcd/I
|
||||||
|
|
||||||
|
/[c]abcd/I
|
||||||
|
|
||||||
|
/(?:c|C)abcd/I
|
||||||
|
|
||||||
|
/(a)?a/I
|
||||||
|
manm
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -1769,4 +1769,38 @@ Options: utf
|
||||||
Starting code units: \xef \xf0 \xf1 \xf2 \xf4
|
Starting code units: \xef \xf0 \xf1 \xf2 \xf4
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ffff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xc3 \xef
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xc3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I
|
||||||
|
Capture group count = 0
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[Ss]/I
|
||||||
|
Capture group count = 0
|
||||||
|
First code unit = 'S' (caseless)
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[Ss]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: S s
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/(?:\x{ff}|\x{3000})/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xc3 \xe3
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -1594,4 +1594,38 @@ First code unit = \xc1 (caseless)
|
||||||
Last code unit = \x{145} (caseless)
|
Last code unit = \x{145} (caseless)
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
|
|
||||||
|
/[\xff\x{ffff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I
|
||||||
|
Capture group count = 0
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[Ss]/I
|
||||||
|
Capture group count = 0
|
||||||
|
Starting code units: S s
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[Ss]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: S s
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/(?:\x{ff}|\x{3000})/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1592,4 +1592,38 @@ First code unit = \xc1 (caseless)
|
||||||
Last code unit = \x{145} (caseless)
|
Last code unit = \x{145} (caseless)
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
|
|
||||||
|
/[\xff\x{ffff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\xff\x{ff}]/I
|
||||||
|
Capture group count = 0
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[Ss]/I
|
||||||
|
Capture group count = 0
|
||||||
|
Starting code units: S s
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[Ss]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: S s
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/(?:\x{ff}|\x{3000})/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xff
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -816,7 +816,7 @@ Capture group count = 1
|
||||||
Max back reference = 1
|
Max back reference = 1
|
||||||
Compile options: <none>
|
Compile options: <none>
|
||||||
Overall options: anchored
|
Overall options: anchored
|
||||||
Starting code units: a
|
First code unit = 'a'
|
||||||
Subject length lower bound = 4
|
Subject length lower bound = 4
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
aaaa
|
aaaa
|
||||||
|
@ -6492,6 +6492,8 @@ Subject length lower bound = 2
|
||||||
0: aaaA5
|
0: aaaA5
|
||||||
aaaa5
|
aaaa5
|
||||||
0: aaaa5
|
0: aaaa5
|
||||||
|
a5
|
||||||
|
0: a5
|
||||||
|
|
||||||
/(a*|b*)[cd]/I
|
/(a*|b*)[cd]/I
|
||||||
Capture group count = 1
|
Capture group count = 1
|
||||||
|
@ -17401,6 +17403,38 @@ First code unit = 'a' (caseless)
|
||||||
Last code unit = 'c' (caseless)
|
Last code unit = 'c' (caseless)
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
|
|
||||||
|
/[cc]abcd/I
|
||||||
|
Capture group count = 0
|
||||||
|
First code unit = 'c'
|
||||||
|
Last code unit = 'd'
|
||||||
|
Subject length lower bound = 5
|
||||||
|
|
||||||
|
/[Cc]abcd/I
|
||||||
|
Capture group count = 0
|
||||||
|
First code unit = 'C' (caseless)
|
||||||
|
Last code unit = 'd'
|
||||||
|
Subject length lower bound = 5
|
||||||
|
|
||||||
|
/[c]abcd/I
|
||||||
|
Capture group count = 0
|
||||||
|
First code unit = 'c'
|
||||||
|
Last code unit = 'd'
|
||||||
|
Subject length lower bound = 5
|
||||||
|
|
||||||
|
/(?:c|C)abcd/I
|
||||||
|
Capture group count = 0
|
||||||
|
First code unit = 'C' (caseless)
|
||||||
|
Last code unit = 'd'
|
||||||
|
Subject length lower bound = 5
|
||||||
|
|
||||||
|
/(a)?a/I
|
||||||
|
Capture group count = 1
|
||||||
|
Starting code units: a
|
||||||
|
Last code unit = 'a'
|
||||||
|
Subject length lower bound = 1
|
||||||
|
manm
|
||||||
|
0: a
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
Error -62: bad serialized data
|
Error -62: bad serialized data
|
||||||
|
|
Loading…
Reference in New Issue