Improve code for "starts with" optimization in the interpreters.

This commit is contained in:
Philip Hazel 2021-08-29 17:25:59 +01:00
parent d5a61ee891
commit eea410b33a
3 changed files with 95 additions and 52 deletions

View File

@ -41,6 +41,11 @@ quote old; it was released in 2014.
detecting symlink loops. This is dependent on the availability of realpath(), detecting symlink loops. This is dependent on the availability of realpath(),
which is now tested for in ./configure and CMakeLists.txt. which is now tested for in ./configure and CMakeLists.txt.
5. Implemented a modified version of Thomas Tempelmann's patch for handling
case-independent "first code unit" searches for unanchored patterns in 8-bit
mode in the interpreters. Instead of just remembering whether one case matched
or not, it remembers the position of a previous match so as to avoid
unnecessary repeated searching.
Version 10.37 26-May-2021 Version 10.37 26-May-2021

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge New API code Copyright (c) 2016-2021 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -3256,8 +3256,8 @@ BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE; BOOL has_req_cu = FALSE;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE; PCRE2_SPTR memchr_found_first_cu = NULL;
BOOL memchr_not_found_first_cu2 = FALSE; PCRE2_SPTR memchr_found_first_cu2 = NULL;
#endif #endif
PCRE2_UCHAR first_cu = 0; PCRE2_UCHAR first_cu = 0;
@ -3648,13 +3648,7 @@ for (;;)
} }
} }
/* Not anchored. Advance to a unique first code unit if there is one. In /* Not anchored. Advance to a unique first code unit if there is one. */
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. If a call to memchr() that
searches the rest of the subject fails to find one case, remember that in
order not to keep on repeating the search. This can make a huge difference
when the strings are very long and only one case is present. */
else else
{ {
@ -3662,43 +3656,68 @@ for (;;)
{ {
if (first_cu != first_cu2) /* Caseless */ if (first_cu != first_cu2) /* Caseless */
{ {
/* In 16-bit and 32-bit modes we have to do our own search, so can
look for both cases at once. */
#if PCRE2_CODE_UNIT_WIDTH != 8 #if PCRE2_CODE_UNIT_WIDTH != 8
PCRE2_UCHAR smc; PCRE2_UCHAR smc;
while (start_match < end_subject && while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu && (smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2) smc != first_cu2)
start_match++; start_match++;
#else
/* In 8-bit mode, the use of memchr() gives a big speed up, even
though we have to call it twice in order to find the earliest
occurrence of the code unit in either of its cases. Caching is used
to remember the positions of previously found code units. This can
make a huge difference when the strings are very long and only one
case is actually present. */
#else /* 8-bit code units */
PCRE2_SPTR pp1 = NULL; PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL; PCRE2_SPTR pp2 = NULL;
PCRE2_SIZE cu2size = end_subject - start_match; PCRE2_SIZE searchlength = end_subject - start_match;
if (!memchr_not_found_first_cu) /* If we haven't got a previously found position for first_cu, or if
the current starting position is later, we need to do a search. If
the code unit is not found, set it to the end. */
if (memchr_found_first_cu == NULL ||
start_match > memchr_found_first_cu)
{ {
pp1 = memchr(start_match, first_cu, end_subject - start_match); pp1 = memchr(start_match, first_cu, searchlength);
if (pp1 == NULL) memchr_not_found_first_cu = TRUE; memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
else cu2size = pp1 - start_match;
} }
/* If pp1 is not NULL, we have arranged to search only as far as pp1, /* If the start is before a previously found position, use the
to see if the other case is earlier, so we can set "not found" only previous position, or NULL if a previous search failed. */
when both searches have returned NULL. */
if (!memchr_not_found_first_cu2) else pp1 = (memchr_found_first_cu == end_subject)? NULL :
memchr_found_first_cu;
/* Do the same thing for the other case. */
if (memchr_found_first_cu2 == NULL ||
start_match > memchr_found_first_cu2)
{ {
pp2 = memchr(start_match, first_cu2, cu2size); pp2 = memchr(start_match, first_cu2, searchlength);
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
} }
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
memchr_found_first_cu2;
/* Set the start to the end of the subject if neither case was found.
Otherwise, use the earlier found point. */
if (pp1 == NULL) if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2; start_match = (pp2 == NULL)? end_subject : pp2;
else else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
#endif
#endif /* 8-bit handling */
} }
/* The caseful case */ /* The caseful case is much simpler. */
else else
{ {

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2020 University of Cambridge New API code Copyright (c) 2015-2021 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -6117,8 +6117,8 @@ BOOL has_req_cu = FALSE;
BOOL startline; BOOL startline;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu; PCRE2_SPTR memchr_found_first_cu;
BOOL memchr_not_found_first_cu2; PCRE2_SPTR memchr_found_first_cu2;
#endif #endif
PCRE2_UCHAR first_cu = 0; PCRE2_UCHAR first_cu = 0;
@ -6712,8 +6712,8 @@ start_partial = match_partial = NULL;
mb->hitend = FALSE; mb->hitend = FALSE;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
memchr_not_found_first_cu = FALSE; memchr_found_first_cu = NULL;
memchr_not_found_first_cu2 = FALSE; memchr_found_first_cu2 = NULL;
#endif #endif
for(;;) for(;;)
@ -6782,13 +6782,7 @@ for(;;)
} }
} }
/* Not anchored. Advance to a unique first code unit if there is one. In /* Not anchored. Advance to a unique first code unit if there is one. */
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. If a call to memchr() that
searches the rest of the subject fails to find one case, remember that in
order not to keep on repeating the search. This can make a huge difference
when the strings are very long and only one case is present. */
else else
{ {
@ -6796,43 +6790,68 @@ for(;;)
{ {
if (first_cu != first_cu2) /* Caseless */ if (first_cu != first_cu2) /* Caseless */
{ {
/* In 16-bit and 32-bit modes we have to do our own search, so can
look for both cases at once. */
#if PCRE2_CODE_UNIT_WIDTH != 8 #if PCRE2_CODE_UNIT_WIDTH != 8
PCRE2_UCHAR smc; PCRE2_UCHAR smc;
while (start_match < end_subject && while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu && (smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2) smc != first_cu2)
start_match++; start_match++;
#else
/* In 8-bit mode, the use of memchr() gives a big speed up, even
though we have to call it twice in order to find the earliest
occurrence of the code unit in either of its cases. Caching is used
to remember the positions of previously found code units. This can
make a huge difference when the strings are very long and only one
case is actually present. */
#else /* 8-bit code units */
PCRE2_SPTR pp1 = NULL; PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL; PCRE2_SPTR pp2 = NULL;
PCRE2_SIZE cu2size = end_subject - start_match; PCRE2_SIZE searchlength = end_subject - start_match;
if (!memchr_not_found_first_cu) /* If we haven't got a previously found position for first_cu, or if
the current starting position is later, we need to do a search. If
the code unit is not found, set it to the end. */
if (memchr_found_first_cu == NULL ||
start_match > memchr_found_first_cu)
{ {
pp1 = memchr(start_match, first_cu, end_subject - start_match); pp1 = memchr(start_match, first_cu, searchlength);
if (pp1 == NULL) memchr_not_found_first_cu = TRUE; memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
else cu2size = pp1 - start_match;
} }
/* If pp1 is not NULL, we have arranged to search only as far as pp1, /* If the start is before a previously found position, use the
to see if the other case is earlier, so we can set "not found" only previous position, or NULL if a previous search failed. */
when both searches have returned NULL. */
if (!memchr_not_found_first_cu2) else pp1 = (memchr_found_first_cu == end_subject)? NULL :
memchr_found_first_cu;
/* Do the same thing for the other case. */
if (memchr_found_first_cu2 == NULL ||
start_match > memchr_found_first_cu2)
{ {
pp2 = memchr(start_match, first_cu2, cu2size); pp2 = memchr(start_match, first_cu2, searchlength);
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
} }
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
memchr_found_first_cu2;
/* Set the start to the end of the subject if neither case was found.
Otherwise, use the earlier found point. */
if (pp1 == NULL) if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2; start_match = (pp2 == NULL)? end_subject : pp2;
else else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
#endif
#endif /* 8-bit handling */
} }
/* The caseful case */ /* The caseful case is much simpler. */
else else
{ {