Optimize classes such as [Aa] to be a single caseless character.

This commit is contained in:
Philip.Hazel 2019-09-09 17:00:19 +00:00
parent aae44b83f8
commit bf15267c30
9 changed files with 114 additions and 18 deletions

View File

@ -146,9 +146,13 @@ compile-time performance improvement).
31. Installed a .gitignore file on a user's suggestion. When using the svn 31. Installed a .gitignore file on a user's suggestion. When using the svn
repository with git (through git svn) this helps keep it tidy. repository with git (through git svn) this helps keep it tidy.
32. Add underflow check in JIT which may occure when the value of subject 32. Add underflow check in JIT which may occur when the value of subject
string pointer is close to 0. string pointer is close to 0.
33. Arrange for classes such as [Aa], which contain just the two cases of the
same character, to be treated as a single caseless character. This causes the
first and required code unit optimizations to kick in where relevant.
Version 10.33 16-April-2019 Version 10.33 16-April-2019
--------------------------- ---------------------------

View File

@ -3635,6 +3635,8 @@ while (ptr < ptrend)
if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
} /* End of class-processing loop */ } /* End of class-processing loop */
/* -] at the end of a class is a literal '-' */
if (class_range_state == RANGE_STARTED) if (class_range_state == RANGE_STARTED)
{ {
parsed_pattern[-1] = CHAR_MINUS; parsed_pattern[-1] = CHAR_MINUS;
@ -5302,6 +5304,7 @@ BOOL groupsetfirstcu = FALSE;
BOOL had_accept = FALSE; BOOL had_accept = FALSE;
BOOL matched_char = FALSE; BOOL matched_char = FALSE;
BOOL previous_matched_char = FALSE; BOOL previous_matched_char = FALSE;
BOOL reset_caseful = FALSE;
const uint8_t *cbits = cb->cbits; const uint8_t *cbits = cb->cbits;
uint8_t classbits[32]; uint8_t classbits[32];
@ -5578,7 +5581,37 @@ for (;; pptr++)
} /* End of 1-char optimization */ } /* End of 1-char optimization */
/* Handle character classes that contain more than just one literal /* Handle character classes that contain more than just one literal
character. */ character. If there are exactly two characters in a positive class, see if
they are case partners. This can be optimized to generate a caseless single
character match (which also sets first/required code units if relevant). */
if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
pptr[3] == META_CLASS_END)
{
uint32_t c = pptr[1];
#ifdef SUPPORT_UNICODE
if (UCD_CASESET(c) == 0)
#endif
{
uint32_t d = TABLE_GET(c, cb->fcc, c);
#ifdef SUPPORT_UNICODE
if (utf && c > 127) d = UCD_OTHERCASE(c);
#endif
if (c != d && pptr[2] == d)
{
pptr += 3; /* Move on to class end */
meta = c;
if ((options & PCRE2_CASELESS) == 0)
{
reset_caseful = TRUE;
options |= PCRE2_CASELESS;
req_caseopt = REQ_CASELESS;
}
goto CLASS_CASELESS_CHAR;
}
}
}
/* If a non-extended class contains a negative special such as \S, we need /* If a non-extended class contains a negative special such as \S, we need
to flip the negation flag at the end, so that support for characters > 255 to flip the negation flag at the end, so that support for characters > 255
@ -7818,9 +7851,15 @@ for (;; pptr++)
} }
#endif #endif
/* Caseful matches, or not one of the multicase characters. Get the /* Caseful matches, or caseless and not one of the multicase characters. We
character's code units into mcbuffer, with the length in mclength. When not come here by goto in the case of a positive class that contains only
in UTF mode, the length is always 1. */ case-partners of a character with just two cases; matched_char has already
been set TRUE and options fudged if necessary. */
CLASS_CASELESS_CHAR:
/* Get the character's code units into mcbuffer, with the length in
mclength. When not in UTF mode, the length is always 1. */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
@ -7852,8 +7891,9 @@ for (;; pptr++)
zeroreqcu = reqcu; zeroreqcu = reqcu;
zeroreqcuflags = reqcuflags; zeroreqcuflags = reqcuflags;
/* If the character is more than one code unit long, we can set firstcu /* If the character is more than one code unit long, we can set a single
only if it is not to be matched caselessly. */ firstcu only if it is not to be matched caselessly. Multiple possible
starting code units may be picked up later in the studying code. */
if (mclength == 1 || req_caseopt == 0) if (mclength == 1 || req_caseopt == 0)
{ {
@ -7883,7 +7923,17 @@ for (;; pptr++)
reqcuflags = req_caseopt | cb->req_varyopt; reqcuflags = req_caseopt | cb->req_varyopt;
} }
} }
break; /* End default meta handling */
/* If caselessness was temporarily instated, reset it. */
if (reset_caseful)
{
options &= ~PCRE2_CASELESS;
req_caseopt = 0;
reset_caseful = FALSE;
}
break; /* End literal character handling */
} /* End of big switch */ } /* End of big switch */
} /* End of big loop */ } /* End of big loop */
@ -8051,7 +8101,7 @@ for (;;)
/* If this is not the first branch, the first char and reqcu have to /* If this is not the first branch, the first char and reqcu have to
match the values from all the previous branches, except that if the match the values from all the previous branches, except that if the
previous value for reqcu didn't have REQ_VARY set, it can still match, previous value for reqcu didn't have REQ_VARY set, it can still match,
and we set REQ_VARY for the regex. */ and we set REQ_VARY for the group from this branch's value. */
else else
{ {
@ -8090,7 +8140,7 @@ for (;;)
else else
{ {
reqcu = branchreqcu; reqcu = branchreqcu;
reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */ reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
} }
} }
} }
@ -10329,9 +10379,10 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
is_startline(codestart, 0, &cb, 0, FALSE)) is_startline(codestart, 0, &cb, 0, FALSE))
re->flags |= PCRE2_STARTLINE; re->flags |= PCRE2_STARTLINE;
/* Handle the "required code unit", if one is set. We can increment the /* Handle the "required code unit", if one is set. In the UTF case we can
minimum length only if we are sure this really is a different increment the minimum length only if we are sure this really is a
character, because the count is in characters, not code units. */ different character and not a non-starting code unit of the first character,
because the minimum length count is in characters, not code units. */
if (reqcuflags >= 0) if (reqcuflags >= 0)
{ {

View File

@ -559,4 +559,6 @@
/(*UTF)(?=\x{123})/I /(*UTF)(?=\x{123})/I
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
# End of testinput10 # End of testinput10

View File

@ -449,4 +449,6 @@
/(*UTF)(?=\x{123})/I /(*UTF)(?=\x{123})/I
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
# End of testinput12 # End of testinput12

2
testdata/testinput2 vendored
View File

@ -5758,4 +5758,6 @@ a)"xI
/(?(VERSION=10.4)b)((?<=b).*)/B /(?(VERSION=10.4)b)((?<=b).*)/B
/[aA]b[cC]/IB
# End of testinput2 # End of testinput2

View File

@ -1766,4 +1766,11 @@ First code unit = \xc4
Last code unit = \xa3 Last code unit = \xa3
Subject length lower bound = 1 Subject length lower bound = 1
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
Capture group count = 0
Options: utf
Starting code units: \xc3
Last code unit = 'X'
Subject length lower bound = 3
# End of testinput10 # End of testinput10

View File

@ -1587,4 +1587,11 @@ Overall options: utf
First code unit = \x{123} First code unit = \x{123}
Subject length lower bound = 1 Subject length lower bound = 1
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
Capture group count = 0
Options: utf
First code unit = \xc1 (caseless)
Last code unit = \x{145} (caseless)
Subject length lower bound = 3
# End of testinput12 # End of testinput12

View File

@ -1585,4 +1585,11 @@ Overall options: utf
First code unit = \x{123} First code unit = \x{123}
Subject length lower bound = 1 Subject length lower bound = 1
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
Capture group count = 0
Options: utf
First code unit = \xc1 (caseless)
Last code unit = \x{145} (caseless)
Subject length lower bound = 3
# End of testinput12 # End of testinput12

14
testdata/testoutput2 vendored
View File

@ -17387,6 +17387,20 @@ Subject length lower bound = 1
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/[aA]b[cC]/IB
------------------------------------------------------------------
Bra
/i a
b
/i c
Ket
End
------------------------------------------------------------------
Capture group count = 0
First code unit = 'a' (caseless)
Last code unit = 'c' (caseless)
Subject length lower bound = 3
# End of testinput2 # End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data Error -62: bad serialized data