Fix pessimizing optimization of start-of-match code units in the interpreters.
This commit is contained in:
parent
963b570fd0
commit
7bbdc58513
11
ChangeLog
11
ChangeLog
|
@@ -92,7 +92,7 @@ within it, the nested lookbehind was not correctly processed. For example, if
|
||||||
|
|
||||||
19. Implemented pcre2_get_match_data_size().
|
19. Implemented pcre2_get_match_data_size().
|
||||||
|
|
||||||
20. Two alterations to partial matching (not yet done by JIT):
|
20. Two alterations to partial matching:
|
||||||
|
|
||||||
(a) The definition of a partial match is slightly changed: if a pattern
|
(a) The definition of a partial match is slightly changed: if a pattern
|
||||||
contains any lookbehinds, an empty partial match may be given, because this
|
contains any lookbehinds, an empty partial match may be given, because this
|
||||||
|
@@ -130,6 +130,15 @@ inspected in that lookahead were not included.
|
||||||
|
|
||||||
28. Add the pcre2_maketables_free() function.
|
28. Add the pcre2_maketables_free() function.
|
||||||
|
|
||||||
|
29. The start-up optimization that looks for a unique initial matching
|
||||||
|
code unit in the interpretive engines uses memchr() in 8-bit mode. When the
|
||||||
|
search is caseless, it was doing so inefficiently, which ended up slowing down
|
||||||
|
the match drastically when the subject was very long. The revised code (a)
|
||||||
|
remembers if one case is not found, so it never repeats the search for that
|
||||||
|
case after a bumpalong and (b) when one case has been found, it searches only
|
||||||
|
up to that position for an earlier occurrence of the other case. This fix
|
||||||
|
applies to both interpretive pcre2_match() and to pcre2_dfa_match().
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@@ -3254,6 +3254,11 @@ BOOL utf, anchored, startline, firstline;
|
||||||
BOOL has_first_cu = FALSE;
|
BOOL has_first_cu = FALSE;
|
||||||
BOOL has_req_cu = FALSE;
|
BOOL has_req_cu = FALSE;
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
BOOL memchr_not_found_first_cu = FALSE;
|
||||||
|
BOOL memchr_not_found_first_cu2 = FALSE;
|
||||||
|
#endif
|
||||||
|
|
||||||
PCRE2_UCHAR first_cu = 0;
|
PCRE2_UCHAR first_cu = 0;
|
||||||
PCRE2_UCHAR first_cu2 = 0;
|
PCRE2_UCHAR first_cu2 = 0;
|
||||||
PCRE2_UCHAR req_cu = 0;
|
PCRE2_UCHAR req_cu = 0;
|
||||||
|
@@ -3634,7 +3639,10 @@ for (;;)
|
||||||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
/* Not anchored. Advance to a unique first code unit if there is one. In
|
||||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
||||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
to call it twice in caseless mode, in order to find the earliest occurrence
|
||||||
of the character in either of its cases. */
|
of the character in either of its cases. If a call to memchr() that
|
||||||
|
searches the rest of the subject fails to find one case, remember that in
|
||||||
|
order not to keep on repeating the search. This can make a huge difference
|
||||||
|
when the strings are very long and only one case is present. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -3648,11 +3656,29 @@ for (;;)
|
||||||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||||
smc != first_cu2)
|
smc != first_cu2)
|
||||||
start_match++;
|
start_match++;
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
#else /* 8-bit code units */
|
||||||
PCRE2_SPTR pp1 =
|
PCRE2_SPTR pp1 = NULL;
|
||||||
memchr(start_match, first_cu, end_subject-start_match);
|
PCRE2_SPTR pp2 = NULL;
|
||||||
PCRE2_SPTR pp2 =
|
PCRE2_SIZE cu2size = end_subject - start_match;
|
||||||
memchr(start_match, first_cu2, end_subject-start_match);
|
|
||||||
|
if (!memchr_not_found_first_cu)
|
||||||
|
{
|
||||||
|
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
||||||
|
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
||||||
|
else cu2size = pp1 - start_match;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
|
||||||
|
to see if the other case is earlier, so we can set "not found" only
|
||||||
|
when both searches have returned NULL. */
|
||||||
|
|
||||||
|
if (!memchr_not_found_first_cu2)
|
||||||
|
{
|
||||||
|
pp2 = memchr(start_match, first_cu2, cu2size);
|
||||||
|
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
||||||
|
}
|
||||||
|
|
||||||
if (pp1 == NULL)
|
if (pp1 == NULL)
|
||||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||||
else
|
else
|
||||||
|
|
|
@@ -494,11 +494,11 @@ in the code. The second one is used when we already know we are past the end of
|
||||||
the subject. We set the "hit end" flag if the pointer is at the end of the
|
the subject. We set the "hit end" flag if the pointer is at the end of the
|
||||||
subject and either (a) the pointer is past the earliest inspected character
|
subject and either (a) the pointer is past the earliest inspected character
|
||||||
(i.e. something has been matched, even if not part of the actual matched
|
(i.e. something has been matched, even if not part of the actual matched
|
||||||
string), or (b) the pattern contains a lookbehind. These are the conditions for
|
string), or (b) the pattern contains a lookbehind. These are the conditions for
|
||||||
which adding more characters may allow the current match to continue.
|
which adding more characters may allow the current match to continue.
|
||||||
|
|
||||||
For hard partial matching, we immediately return a partial match. Otherwise,
|
For hard partial matching, we immediately return a partial match. Otherwise,
|
||||||
carrying on means that a complete match on the current subject will be sought.
|
carrying on means that a complete match on the current subject will be sought.
|
||||||
A partial match is returned only if no complete match can be found. */
|
A partial match is returned only if no complete match can be found. */
|
||||||
|
|
||||||
#define CHECK_PARTIAL()\
|
#define CHECK_PARTIAL()\
|
||||||
|
@@ -5658,10 +5658,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
case OP_EOD:
|
case OP_EOD:
|
||||||
if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
|
if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
|
||||||
if (mb->partial != 0)
|
if (mb->partial != 0)
|
||||||
{
|
{
|
||||||
mb->hitend = TRUE;
|
mb->hitend = TRUE;
|
||||||
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
|
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
|
||||||
}
|
}
|
||||||
Fecode++;
|
Fecode++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@@ -5687,10 +5687,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||||
/* Either at end of string or \n before end. */
|
/* Either at end of string or \n before end. */
|
||||||
|
|
||||||
if (mb->partial != 0)
|
if (mb->partial != 0)
|
||||||
{
|
{
|
||||||
mb->hitend = TRUE;
|
mb->hitend = TRUE;
|
||||||
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
|
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
|
||||||
}
|
}
|
||||||
Fecode++;
|
Fecode++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@@ -6047,6 +6047,11 @@ BOOL has_req_cu = FALSE;
|
||||||
BOOL startline;
|
BOOL startline;
|
||||||
BOOL utf;
|
BOOL utf;
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
BOOL memchr_not_found_first_cu = FALSE;
|
||||||
|
BOOL memchr_not_found_first_cu2 = FALSE;
|
||||||
|
#endif
|
||||||
|
|
||||||
PCRE2_UCHAR first_cu = 0;
|
PCRE2_UCHAR first_cu = 0;
|
||||||
PCRE2_UCHAR first_cu2 = 0;
|
PCRE2_UCHAR first_cu2 = 0;
|
||||||
PCRE2_UCHAR req_cu = 0;
|
PCRE2_UCHAR req_cu = 0;
|
||||||
|
@@ -6453,7 +6458,7 @@ mb->start_subject = subject;
|
||||||
mb->start_offset = start_offset;
|
mb->start_offset = start_offset;
|
||||||
mb->end_subject = end_subject;
|
mb->end_subject = end_subject;
|
||||||
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
|
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
|
||||||
mb->allowemptypartial = (re->max_lookbehind > 0) ||
|
mb->allowemptypartial = (re->max_lookbehind > 0) ||
|
||||||
(re->flags & PCRE2_MATCH_EMPTY) != 0;
|
(re->flags & PCRE2_MATCH_EMPTY) != 0;
|
||||||
mb->poptions = re->overall_options; /* Pattern options */
|
mb->poptions = re->overall_options; /* Pattern options */
|
||||||
mb->ignore_skip_arg = 0;
|
mb->ignore_skip_arg = 0;
|
||||||
|
@@ -6686,7 +6691,10 @@ for(;;)
|
||||||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
/* Not anchored. Advance to a unique first code unit if there is one. In
|
||||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
||||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
to call it twice in caseless mode, in order to find the earliest occurrence
|
||||||
of the character in either of its cases. */
|
of the character in either of its cases. If a call to memchr() that
|
||||||
|
searches the rest of the subject fails to find one case, remember that in
|
||||||
|
order not to keep on repeating the search. This can make a huge difference
|
||||||
|
when the strings are very long and only one case is present. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -6700,11 +6708,29 @@ for(;;)
|
||||||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||||
smc != first_cu2)
|
smc != first_cu2)
|
||||||
start_match++;
|
start_match++;
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
#else /* 8-bit code units */
|
||||||
PCRE2_SPTR pp1 =
|
PCRE2_SPTR pp1 = NULL;
|
||||||
memchr(start_match, first_cu, end_subject-start_match);
|
PCRE2_SPTR pp2 = NULL;
|
||||||
PCRE2_SPTR pp2 =
|
PCRE2_SIZE cu2size = end_subject - start_match;
|
||||||
memchr(start_match, first_cu2, end_subject-start_match);
|
|
||||||
|
if (!memchr_not_found_first_cu)
|
||||||
|
{
|
||||||
|
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
||||||
|
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
||||||
|
else cu2size = pp1 - start_match;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
|
||||||
|
to see if the other case is earlier, so we can set "not found" only
|
||||||
|
when both searches have returned NULL. */
|
||||||
|
|
||||||
|
if (!memchr_not_found_first_cu2)
|
||||||
|
{
|
||||||
|
pp2 = memchr(start_match, first_cu2, cu2size);
|
||||||
|
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
||||||
|
}
|
||||||
|
|
||||||
if (pp1 == NULL)
|
if (pp1 == NULL)
|
||||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||||
else
|
else
|
||||||
|
|
Loading…
Reference in New Issue