Fix pessimizing optimization of start-of-match code units in the interpreters.
This commit is contained in:
parent
963b570fd0
commit
7bbdc58513
11
ChangeLog
11
ChangeLog
|
@ -92,7 +92,7 @@ within it, the nested lookbehind was not correctly processed. For example, if
|
|||
|
||||
19. Implemented pcre2_get_match_data_size().
|
||||
|
||||
20. Two alterations to partial matching (not yet done by JIT):
|
||||
20. Two alterations to partial matching:
|
||||
|
||||
(a) The definition of a partial match is slightly changed: if a pattern
|
||||
contains any lookbehinds, an empty partial match may be given, because this
|
||||
|
@ -130,6 +130,15 @@ inspected in that lookahead were not included.
|
|||
|
||||
28. Add the pcre2_maketables_free() function.
|
||||
|
||||
29. The start-up optimization that looks for a unique initial matching
|
||||
code unit in the interpretive engines uses memchr() in 8-bit mode. When the
|
||||
search is caseless, it was doing so inefficiently, which ended up slowing down
|
||||
the match drastically when the subject was very long. The revised code (a)
|
||||
remembers if one case is not found, so it never repeats the search for that
|
||||
case after a bumpalong and (b) when one case has been found, it searches only
|
||||
up to that position for an earlier occurrence of the other case. This fix
|
||||
applies to both interpretive pcre2_match() and to pcre2_dfa_match().
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
---------------------------
|
||||
|
|
|
@ -3254,6 +3254,11 @@ BOOL utf, anchored, startline, firstline;
|
|||
BOOL has_first_cu = FALSE;
|
||||
BOOL has_req_cu = FALSE;
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
BOOL memchr_not_found_first_cu = FALSE;
|
||||
BOOL memchr_not_found_first_cu2 = FALSE;
|
||||
#endif
|
||||
|
||||
PCRE2_UCHAR first_cu = 0;
|
||||
PCRE2_UCHAR first_cu2 = 0;
|
||||
PCRE2_UCHAR req_cu = 0;
|
||||
|
@ -3634,7 +3639,10 @@ for (;;)
|
|||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
||||
of the character in either of its cases. */
|
||||
of the character in either of its cases. If a call to memchr() that
|
||||
searches the rest of the subject fails to find one case, remember that in
|
||||
order not to keep on repeating the search. This can make a huge difference
|
||||
when the strings are very long and only one case is present. */
|
||||
|
||||
else
|
||||
{
|
||||
|
@ -3648,11 +3656,29 @@ for (;;)
|
|||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||
smc != first_cu2)
|
||||
start_match++;
|
||||
|
||||
#else /* 8-bit code units */
|
||||
PCRE2_SPTR pp1 =
|
||||
memchr(start_match, first_cu, end_subject-start_match);
|
||||
PCRE2_SPTR pp2 =
|
||||
memchr(start_match, first_cu2, end_subject-start_match);
|
||||
PCRE2_SPTR pp1 = NULL;
|
||||
PCRE2_SPTR pp2 = NULL;
|
||||
PCRE2_SIZE cu2size = end_subject - start_match;
|
||||
|
||||
if (!memchr_not_found_first_cu)
|
||||
{
|
||||
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
||||
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
||||
else cu2size = pp1 - start_match;
|
||||
}
|
||||
|
||||
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
|
||||
to see if the other case is earlier, so we can set "not found" only
|
||||
when both searches have returned NULL. */
|
||||
|
||||
if (!memchr_not_found_first_cu2)
|
||||
{
|
||||
pp2 = memchr(start_match, first_cu2, cu2size);
|
||||
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
||||
}
|
||||
|
||||
if (pp1 == NULL)
|
||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||
else
|
||||
|
|
|
@ -6047,6 +6047,11 @@ BOOL has_req_cu = FALSE;
|
|||
BOOL startline;
|
||||
BOOL utf;
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
BOOL memchr_not_found_first_cu = FALSE;
|
||||
BOOL memchr_not_found_first_cu2 = FALSE;
|
||||
#endif
|
||||
|
||||
PCRE2_UCHAR first_cu = 0;
|
||||
PCRE2_UCHAR first_cu2 = 0;
|
||||
PCRE2_UCHAR req_cu = 0;
|
||||
|
@ -6686,7 +6691,10 @@ for(;;)
|
|||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
||||
of the character in either of its cases. */
|
||||
of the character in either of its cases. If a call to memchr() that
|
||||
searches the rest of the subject fails to find one case, remember that in
|
||||
order not to keep on repeating the search. This can make a huge difference
|
||||
when the strings are very long and only one case is present. */
|
||||
|
||||
else
|
||||
{
|
||||
|
@ -6700,11 +6708,29 @@ for(;;)
|
|||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||
smc != first_cu2)
|
||||
start_match++;
|
||||
|
||||
#else /* 8-bit code units */
|
||||
PCRE2_SPTR pp1 =
|
||||
memchr(start_match, first_cu, end_subject-start_match);
|
||||
PCRE2_SPTR pp2 =
|
||||
memchr(start_match, first_cu2, end_subject-start_match);
|
||||
PCRE2_SPTR pp1 = NULL;
|
||||
PCRE2_SPTR pp2 = NULL;
|
||||
PCRE2_SIZE cu2size = end_subject - start_match;
|
||||
|
||||
if (!memchr_not_found_first_cu)
|
||||
{
|
||||
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
||||
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
||||
else cu2size = pp1 - start_match;
|
||||
}
|
||||
|
||||
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
|
||||
to see if the other case is earlier, so we can set "not found" only
|
||||
when both searches have returned NULL. */
|
||||
|
||||
if (!memchr_not_found_first_cu2)
|
||||
{
|
||||
pp2 = memchr(start_match, first_cu2, cu2size);
|
||||
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
||||
}
|
||||
|
||||
if (pp1 == NULL)
|
||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||
else
|
||||
|
|
Loading…
Reference in New Issue