Fix pessimizing optimization of start-of-match code units in the interpreters.

This commit is contained in:
Philip.Hazel 2019-09-06 16:08:45 +00:00
parent 963b570fd0
commit 7bbdc58513
3 changed files with 79 additions and 18 deletions

View File

@ -92,7 +92,7 @@ within it, the nested lookbehind was not correctly processed. For example, if
19. Implemented pcre2_get_match_data_size().
20. Two alterations to partial matching (not yet done by JIT):
20. Two alterations to partial matching:
(a) The definition of a partial match is slightly changed: if a pattern
contains any lookbehinds, an empty partial match may be given, because this
@ -130,6 +130,15 @@ inspected in that lookahead were not included.
28. Add the pcre2_maketables_free() function.
29. The start-up optimization that looks for a unique initial matching
code unit in the interpretive engines uses memchr() in 8-bit mode. When the
search is caseless, it was doing so inefficiently, which ended up slowing down
the match drastically when the subject was very long. The revised code (a)
remembers if one case is not found, so it never repeats the search for that
case after a bumpalong and (b) when one case has been found, it searches only
up to that position for an earlier occurrence of the other case. This fix
applies to both interpretive pcre2_match() and to pcre2_dfa_match().
Version 10.33 16-April-2019
---------------------------

View File

@ -3254,6 +3254,11 @@ BOOL utf, anchored, startline, firstline;
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
BOOL memchr_not_found_first_cu2 = FALSE;
#endif
PCRE2_UCHAR first_cu = 0;
PCRE2_UCHAR first_cu2 = 0;
PCRE2_UCHAR req_cu = 0;
@ -3634,7 +3639,10 @@ for (;;)
/* Not anchored. Advance to a unique first code unit if there is one. In
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. */
of the character in either of its cases. If a call to memchr() that
searches the rest of the subject fails to find one case, remember that in
order not to keep on repeating the search. This can make a huge difference
when the strings are very long and only one case is present. */
else
{
@ -3648,11 +3656,29 @@ for (;;)
(smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2)
start_match++;
#else /* 8-bit code units */
PCRE2_SPTR pp1 =
memchr(start_match, first_cu, end_subject-start_match);
PCRE2_SPTR pp2 =
memchr(start_match, first_cu2, end_subject-start_match);
PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL;
PCRE2_SIZE cu2size = end_subject - start_match;
if (!memchr_not_found_first_cu)
{
pp1 = memchr(start_match, first_cu, end_subject - start_match);
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
else cu2size = pp1 - start_match;
}
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
to see if the other case is earlier, so we can set "not found" only
when both searches have returned NULL. */
if (!memchr_not_found_first_cu2)
{
pp2 = memchr(start_match, first_cu2, cu2size);
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
}
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else

View File

@ -494,11 +494,11 @@ in the code. The second one is used when we already know we are past the end of
the subject. We set the "hit end" flag if the pointer is at the end of the
subject and either (a) the pointer is past the earliest inspected character
(i.e. something has been matched, even if not part of the actual matched
string), or (b) the pattern contains a lookbehind. These are the conditions for
string), or (b) the pattern contains a lookbehind. These are the conditions for
which adding more characters may allow the current match to continue.
For hard partial matching, we immediately return a partial match. Otherwise,
carrying on means that a complete match on the current subject will be sought.
carrying on means that a complete match on the current subject will be sought.
A partial match is returned only if no complete match can be found. */
#define CHECK_PARTIAL()\
@ -5658,10 +5658,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case OP_EOD:
if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
if (mb->partial != 0)
{
{
mb->hitend = TRUE;
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
}
Fecode++;
break;
@ -5687,10 +5687,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* Either at end of string or \n before end. */
if (mb->partial != 0)
{
{
mb->hitend = TRUE;
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
}
Fecode++;
break;
@ -6047,6 +6047,11 @@ BOOL has_req_cu = FALSE;
BOOL startline;
BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
BOOL memchr_not_found_first_cu2 = FALSE;
#endif
PCRE2_UCHAR first_cu = 0;
PCRE2_UCHAR first_cu2 = 0;
PCRE2_UCHAR req_cu = 0;
@ -6453,7 +6458,7 @@ mb->start_subject = subject;
mb->start_offset = start_offset;
mb->end_subject = end_subject;
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
mb->allowemptypartial = (re->max_lookbehind > 0) ||
mb->allowemptypartial = (re->max_lookbehind > 0) ||
(re->flags & PCRE2_MATCH_EMPTY) != 0;
mb->poptions = re->overall_options; /* Pattern options */
mb->ignore_skip_arg = 0;
@ -6686,7 +6691,10 @@ for(;;)
/* Not anchored. Advance to a unique first code unit if there is one. In
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. */
of the character in either of its cases. If a call to memchr() that
searches the rest of the subject fails to find one case, remember that in
order not to keep on repeating the search. This can make a huge difference
when the strings are very long and only one case is present. */
else
{
@ -6700,11 +6708,29 @@ for(;;)
(smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2)
start_match++;
#else /* 8-bit code units */
PCRE2_SPTR pp1 =
memchr(start_match, first_cu, end_subject-start_match);
PCRE2_SPTR pp2 =
memchr(start_match, first_cu2, end_subject-start_match);
PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL;
PCRE2_SIZE cu2size = end_subject - start_match;
if (!memchr_not_found_first_cu)
{
pp1 = memchr(start_match, first_cu, end_subject - start_match);
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
else cu2size = pp1 - start_match;
}
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
to see if the other case is earlier, so we can set "not found" only
when both searches have returned NULL. */
if (!memchr_not_found_first_cu2)
{
pp2 = memchr(start_match, first_cu2, cu2size);
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
}
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else