Optimize classes such as [Aa] to be a single caseless character.
This commit is contained in:
parent
aae44b83f8
commit
bf15267c30
|
@ -146,9 +146,13 @@ compile-time performance improvement).
|
||||||
31. Installed a .gitignore file on a user's suggestion. When using the svn
|
31. Installed a .gitignore file on a user's suggestion. When using the svn
|
||||||
repository with git (through git svn) this helps keep it tidy.
|
repository with git (through git svn) this helps keep it tidy.
|
||||||
|
|
||||||
32. Add underflow check in JIT which may occure when the value of subject
|
32. Add underflow check in JIT which may occur when the value of subject
|
||||||
string pointer is close to 0.
|
string pointer is close to 0.
|
||||||
|
|
||||||
|
33. Arrange for classes such as [Aa], which contain just the two cases of the
|
||||||
|
same character, to be treated as a single caseless character. This causes the
|
||||||
|
first and required code unit optimizations to kick in where relevant.
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@ -3635,6 +3635,8 @@ while (ptr < ptrend)
|
||||||
if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
|
if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
|
||||||
} /* End of class-processing loop */
|
} /* End of class-processing loop */
|
||||||
|
|
||||||
|
/* -] at the end of a class is a literal '-' */
|
||||||
|
|
||||||
if (class_range_state == RANGE_STARTED)
|
if (class_range_state == RANGE_STARTED)
|
||||||
{
|
{
|
||||||
parsed_pattern[-1] = CHAR_MINUS;
|
parsed_pattern[-1] = CHAR_MINUS;
|
||||||
|
@ -5302,6 +5304,7 @@ BOOL groupsetfirstcu = FALSE;
|
||||||
BOOL had_accept = FALSE;
|
BOOL had_accept = FALSE;
|
||||||
BOOL matched_char = FALSE;
|
BOOL matched_char = FALSE;
|
||||||
BOOL previous_matched_char = FALSE;
|
BOOL previous_matched_char = FALSE;
|
||||||
|
BOOL reset_caseful = FALSE;
|
||||||
const uint8_t *cbits = cb->cbits;
|
const uint8_t *cbits = cb->cbits;
|
||||||
uint8_t classbits[32];
|
uint8_t classbits[32];
|
||||||
|
|
||||||
|
@ -5578,7 +5581,37 @@ for (;; pptr++)
|
||||||
} /* End of 1-char optimization */
|
} /* End of 1-char optimization */
|
||||||
|
|
||||||
/* Handle character classes that contain more than just one literal
|
/* Handle character classes that contain more than just one literal
|
||||||
character. */
|
character. If there are exactly two characters in a positive class, see if
|
||||||
|
they are case partners. This can be optimized to generate a caseless single
|
||||||
|
character match (which also sets first/required code units if relevant). */
|
||||||
|
|
||||||
|
if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
|
||||||
|
pptr[3] == META_CLASS_END)
|
||||||
|
{
|
||||||
|
uint32_t c = pptr[1];
|
||||||
|
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (UCD_CASESET(c) == 0)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
uint32_t d = TABLE_GET(c, cb->fcc, c);
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (utf && c > 127) d = UCD_OTHERCASE(c);
|
||||||
|
#endif
|
||||||
|
if (c != d && pptr[2] == d)
|
||||||
|
{
|
||||||
|
pptr += 3; /* Move on to class end */
|
||||||
|
meta = c;
|
||||||
|
if ((options & PCRE2_CASELESS) == 0)
|
||||||
|
{
|
||||||
|
reset_caseful = TRUE;
|
||||||
|
options |= PCRE2_CASELESS;
|
||||||
|
req_caseopt = REQ_CASELESS;
|
||||||
|
}
|
||||||
|
goto CLASS_CASELESS_CHAR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* If a non-extended class contains a negative special such as \S, we need
|
/* If a non-extended class contains a negative special such as \S, we need
|
||||||
to flip the negation flag at the end, so that support for characters > 255
|
to flip the negation flag at the end, so that support for characters > 255
|
||||||
|
@ -7818,9 +7851,15 @@ for (;; pptr++)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Caseful matches, or not one of the multicase characters. Get the
|
/* Caseful matches, or caseless and not one of the multicase characters. We
|
||||||
character's code units into mcbuffer, with the length in mclength. When not
|
come here by goto in the case of a positive class that contains only
|
||||||
in UTF mode, the length is always 1. */
|
case-partners of a character with just two cases; matched_char has already
|
||||||
|
been set TRUE and options fudged if necessary. */
|
||||||
|
|
||||||
|
CLASS_CASELESS_CHAR:
|
||||||
|
|
||||||
|
/* Get the character's code units into mcbuffer, with the length in
|
||||||
|
mclength. When not in UTF mode, the length is always 1. */
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
|
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
|
||||||
|
@ -7852,8 +7891,9 @@ for (;; pptr++)
|
||||||
zeroreqcu = reqcu;
|
zeroreqcu = reqcu;
|
||||||
zeroreqcuflags = reqcuflags;
|
zeroreqcuflags = reqcuflags;
|
||||||
|
|
||||||
/* If the character is more than one code unit long, we can set firstcu
|
/* If the character is more than one code unit long, we can set a single
|
||||||
only if it is not to be matched caselessly. */
|
firstcu only if it is not to be matched caselessly. Multiple possible
|
||||||
|
starting code units may be picked up later in the studying code. */
|
||||||
|
|
||||||
if (mclength == 1 || req_caseopt == 0)
|
if (mclength == 1 || req_caseopt == 0)
|
||||||
{
|
{
|
||||||
|
@ -7883,7 +7923,17 @@ for (;; pptr++)
|
||||||
reqcuflags = req_caseopt | cb->req_varyopt;
|
reqcuflags = req_caseopt | cb->req_varyopt;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break; /* End default meta handling */
|
|
||||||
|
/* If caselessness was temporarily instated, reset it. */
|
||||||
|
|
||||||
|
if (reset_caseful)
|
||||||
|
{
|
||||||
|
options &= ~PCRE2_CASELESS;
|
||||||
|
req_caseopt = 0;
|
||||||
|
reset_caseful = FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
break; /* End literal character handling */
|
||||||
} /* End of big switch */
|
} /* End of big switch */
|
||||||
} /* End of big loop */
|
} /* End of big loop */
|
||||||
|
|
||||||
|
@ -8051,7 +8101,7 @@ for (;;)
|
||||||
/* If this is not the first branch, the first char and reqcu have to
|
/* If this is not the first branch, the first char and reqcu have to
|
||||||
match the values from all the previous branches, except that if the
|
match the values from all the previous branches, except that if the
|
||||||
previous value for reqcu didn't have REQ_VARY set, it can still match,
|
previous value for reqcu didn't have REQ_VARY set, it can still match,
|
||||||
and we set REQ_VARY for the regex. */
|
and we set REQ_VARY for the group from this branch's value. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -8090,7 +8140,7 @@ for (;;)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
reqcu = branchreqcu;
|
reqcu = branchreqcu;
|
||||||
reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
|
reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10329,9 +10379,10 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
||||||
is_startline(codestart, 0, &cb, 0, FALSE))
|
is_startline(codestart, 0, &cb, 0, FALSE))
|
||||||
re->flags |= PCRE2_STARTLINE;
|
re->flags |= PCRE2_STARTLINE;
|
||||||
|
|
||||||
/* Handle the "required code unit", if one is set. We can increment the
|
/* Handle the "required code unit", if one is set. In the UTF case we can
|
||||||
minimum minimum length only if we are sure this really is a different
|
increment the minimum length only if we are sure this really is a
|
||||||
character, because the count is in characters, not code units. */
|
different character and not a non-starting code unit of the first character,
|
||||||
|
because the minimum length count is in characters, not code units. */
|
||||||
|
|
||||||
if (reqcuflags >= 0)
|
if (reqcuflags >= 0)
|
||||||
{
|
{
|
||||||
|
|
|
@ -559,4 +559,6 @@
|
||||||
|
|
||||||
/(*UTF)(?=\x{123})/I
|
/(*UTF)(?=\x{123})/I
|
||||||
|
|
||||||
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -449,4 +449,6 @@
|
||||||
|
|
||||||
/(*UTF)(?=\x{123})/I
|
/(*UTF)(?=\x{123})/I
|
||||||
|
|
||||||
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -5758,4 +5758,6 @@ a)"xI
|
||||||
|
|
||||||
/(?(VERSION=10.4)b)((?<=b).*)/B
|
/(?(VERSION=10.4)b)((?<=b).*)/B
|
||||||
|
|
||||||
|
/[aA]b[cC]/IB
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -1766,4 +1766,11 @@ First code unit = \xc4
|
||||||
Last code unit = \xa3
|
Last code unit = \xa3
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
Starting code units: \xc3
|
||||||
|
Last code unit = 'X'
|
||||||
|
Subject length lower bound = 3
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -1587,4 +1587,11 @@ Overall options: utf
|
||||||
First code unit = \x{123}
|
First code unit = \x{123}
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Last code unit = \x{145} (caseless)
|
||||||
|
Subject length lower bound = 3
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1585,4 +1585,11 @@ Overall options: utf
|
||||||
First code unit = \x{123}
|
First code unit = \x{123}
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
|
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
|
||||||
|
Capture group count = 0
|
||||||
|
Options: utf
|
||||||
|
First code unit = \xc1 (caseless)
|
||||||
|
Last code unit = \x{145} (caseless)
|
||||||
|
Subject length lower bound = 3
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -17387,6 +17387,20 @@ Subject length lower bound = 1
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[aA]b[cC]/IB
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
/i a
|
||||||
|
b
|
||||||
|
/i c
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Capture group count = 0
|
||||||
|
First code unit = 'a' (caseless)
|
||||||
|
Last code unit = 'c' (caseless)
|
||||||
|
Subject length lower bound = 3
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
Error -62: bad serialized data
|
Error -62: bad serialized data
|
||||||
|
|
Loading…
Reference in New Issue