Fix pessimizing optimization of start-of-match code units in the interpreters.
This commit is contained in:
parent
963b570fd0
commit
7bbdc58513
11
ChangeLog
11
ChangeLog
|
@ -92,7 +92,7 @@ within it, the nested lookbehind was not correctly processed. For example, if
|
||||||
|
|
||||||
19. Implemented pcre2_get_match_data_size().
|
19. Implemented pcre2_get_match_data_size().
|
||||||
|
|
||||||
20. Two alterations to partial matching (not yet done by JIT):
|
20. Two alterations to partial matching:
|
||||||
|
|
||||||
(a) The definition of a partial match is slightly changed: if a pattern
|
(a) The definition of a partial match is slightly changed: if a pattern
|
||||||
contains any lookbehinds, an empty partial match may be given, because this
|
contains any lookbehinds, an empty partial match may be given, because this
|
||||||
|
@ -130,6 +130,15 @@ inspected in that lookahead were not included.
|
||||||
|
|
||||||
28. Add the pcre2_maketables_free() function.
|
28. Add the pcre2_maketables_free() function.
|
||||||
|
|
||||||
|
29. The start-up optimization that looks for a unique initial matching
|
||||||
|
code unit in the interpretive engines uses memchr() in 8-bit mode. When the
|
||||||
|
search is caseless, it was doing so inefficiently, which ended up slowing down
|
||||||
|
the match drastically when the subject was very long. The revised code (a)
|
||||||
|
remembers if one case is not found, so it never repeats the search for that
|
||||||
|
case after a bumpalong and (b) when one case has been found, it searches only
|
||||||
|
up to that position for an earlier occurrence of the other case. This fix
|
||||||
|
applies both to the interpretive pcre2_match() and to pcre2_dfa_match().
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@ -3254,6 +3254,11 @@ BOOL utf, anchored, startline, firstline;
|
||||||
BOOL has_first_cu = FALSE;
|
BOOL has_first_cu = FALSE;
|
||||||
BOOL has_req_cu = FALSE;
|
BOOL has_req_cu = FALSE;
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
BOOL memchr_not_found_first_cu = FALSE;
|
||||||
|
BOOL memchr_not_found_first_cu2 = FALSE;
|
||||||
|
#endif
|
||||||
|
|
||||||
PCRE2_UCHAR first_cu = 0;
|
PCRE2_UCHAR first_cu = 0;
|
||||||
PCRE2_UCHAR first_cu2 = 0;
|
PCRE2_UCHAR first_cu2 = 0;
|
||||||
PCRE2_UCHAR req_cu = 0;
|
PCRE2_UCHAR req_cu = 0;
|
||||||
|
@ -3634,7 +3639,10 @@ for (;;)
|
||||||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
/* Not anchored. Advance to a unique first code unit if there is one. In
|
||||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
||||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
to call it twice in caseless mode, in order to find the earliest occurrence
|
||||||
of the character in either of its cases. */
|
of the character in either of its cases. If a call to memchr() that
|
||||||
|
searches the rest of the subject fails to find one case, remember that in
|
||||||
|
order not to keep on repeating the search. This can make a huge difference
|
||||||
|
when the strings are very long and only one case is present. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -3648,11 +3656,29 @@ for (;;)
|
||||||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||||
smc != first_cu2)
|
smc != first_cu2)
|
||||||
start_match++;
|
start_match++;
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
#else /* 8-bit code units */
|
||||||
PCRE2_SPTR pp1 =
|
PCRE2_SPTR pp1 = NULL;
|
||||||
memchr(start_match, first_cu, end_subject-start_match);
|
PCRE2_SPTR pp2 = NULL;
|
||||||
PCRE2_SPTR pp2 =
|
PCRE2_SIZE cu2size = end_subject - start_match;
|
||||||
memchr(start_match, first_cu2, end_subject-start_match);
|
|
||||||
|
if (!memchr_not_found_first_cu)
|
||||||
|
{
|
||||||
|
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
||||||
|
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
||||||
|
else cu2size = pp1 - start_match;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If pp1 is not NULL, the second search stops at pp1, so a NULL result from
|
||||||
|
it does not prove that the other case is absent from the rest of the subject.
|
||||||
|
We can therefore set "not found" only when both searches have returned NULL. */
|
||||||
|
|
||||||
|
if (!memchr_not_found_first_cu2)
|
||||||
|
{
|
||||||
|
pp2 = memchr(start_match, first_cu2, cu2size);
|
||||||
|
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
||||||
|
}
|
||||||
|
|
||||||
if (pp1 == NULL)
|
if (pp1 == NULL)
|
||||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||||
else
|
else
|
||||||
|
|
|
@ -6047,6 +6047,11 @@ BOOL has_req_cu = FALSE;
|
||||||
BOOL startline;
|
BOOL startline;
|
||||||
BOOL utf;
|
BOOL utf;
|
||||||
|
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
BOOL memchr_not_found_first_cu = FALSE;
|
||||||
|
BOOL memchr_not_found_first_cu2 = FALSE;
|
||||||
|
#endif
|
||||||
|
|
||||||
PCRE2_UCHAR first_cu = 0;
|
PCRE2_UCHAR first_cu = 0;
|
||||||
PCRE2_UCHAR first_cu2 = 0;
|
PCRE2_UCHAR first_cu2 = 0;
|
||||||
PCRE2_UCHAR req_cu = 0;
|
PCRE2_UCHAR req_cu = 0;
|
||||||
|
@ -6686,7 +6691,10 @@ for(;;)
|
||||||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
/* Not anchored. Advance to a unique first code unit if there is one. In
|
||||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
||||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
to call it twice in caseless mode, in order to find the earliest occurrence
|
||||||
of the character in either of its cases. */
|
of the character in either of its cases. If a call to memchr() that
|
||||||
|
searches the rest of the subject fails to find one case, remember that in
|
||||||
|
order not to keep on repeating the search. This can make a huge difference
|
||||||
|
when the strings are very long and only one case is present. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -6700,11 +6708,29 @@ for(;;)
|
||||||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||||
smc != first_cu2)
|
smc != first_cu2)
|
||||||
start_match++;
|
start_match++;
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
#else /* 8-bit code units */
|
||||||
PCRE2_SPTR pp1 =
|
PCRE2_SPTR pp1 = NULL;
|
||||||
memchr(start_match, first_cu, end_subject-start_match);
|
PCRE2_SPTR pp2 = NULL;
|
||||||
PCRE2_SPTR pp2 =
|
PCRE2_SIZE cu2size = end_subject - start_match;
|
||||||
memchr(start_match, first_cu2, end_subject-start_match);
|
|
||||||
|
if (!memchr_not_found_first_cu)
|
||||||
|
{
|
||||||
|
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
||||||
|
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
||||||
|
else cu2size = pp1 - start_match;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If pp1 is not NULL, the second search stops at pp1, so a NULL result from
|
||||||
|
it does not prove that the other case is absent from the rest of the subject.
|
||||||
|
We can therefore set "not found" only when both searches have returned NULL. */
|
||||||
|
|
||||||
|
if (!memchr_not_found_first_cu2)
|
||||||
|
{
|
||||||
|
pp2 = memchr(start_match, first_cu2, cu2size);
|
||||||
|
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
||||||
|
}
|
||||||
|
|
||||||
if (pp1 == NULL)
|
if (pp1 == NULL)
|
||||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||||
else
|
else
|
||||||
|
|
Loading…
Reference in New Issue