Improve code for "starts with" optimization in the interpreters.
This commit is contained in:
parent
d5a61ee891
commit
eea410b33a
|
@ -41,6 +41,11 @@ quote old; it was released in 2014.
|
||||||
detecting symlink loops. This is dependent on the availability of realpath(),
|
detecting symlink loops. This is dependent on the availability of realpath(),
|
||||||
which is now tested for in ./configure and CMakeLists.txt.
|
which is now tested for in ./configure and CMakeLists.txt.
|
||||||
|
|
||||||
|
5. Implemented a modified version of Thomas Tempelmann's patch for handling
|
||||||
|
case-independent "first code unit" searches for unanchored patterns in 8-bit
|
||||||
|
mode in the interpreters. Instead of just remembering whether one case matched
|
||||||
|
or not, it remembers the position of a previous match so as to avoid
|
||||||
|
unnecessary repeated searching.
|
||||||
|
|
||||||
|
|
||||||
Version 10.37 26-May-2021
|
Version 10.37 26-May-2021
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -3256,8 +3256,8 @@ BOOL has_first_cu = FALSE;
|
||||||
BOOL has_req_cu = FALSE;
|
BOOL has_req_cu = FALSE;
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
BOOL memchr_not_found_first_cu = FALSE;
|
PCRE2_SPTR memchr_found_first_cu = NULL;
|
||||||
BOOL memchr_not_found_first_cu2 = FALSE;
|
PCRE2_SPTR memchr_found_first_cu2 = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
PCRE2_UCHAR first_cu = 0;
|
PCRE2_UCHAR first_cu = 0;
|
||||||
|
@ -3648,13 +3648,7 @@ for (;;)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
/* Not anchored. Advance to a unique first code unit if there is one. */
|
||||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
|
||||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
|
||||||
of the character in either of its cases. If a call to memchr() that
|
|
||||||
searches the rest of the subject fails to find one case, remember that in
|
|
||||||
order not to keep on repeating the search. This can make a huge difference
|
|
||||||
when the strings are very long and only one case is present. */
|
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -3662,43 +3656,68 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (first_cu != first_cu2) /* Caseless */
|
if (first_cu != first_cu2) /* Caseless */
|
||||||
{
|
{
|
||||||
|
/* In 16-bit and 32-bit modes we have to do our own search, so can
|
||||||
|
look for both cases at once. */
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH != 8
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
PCRE2_UCHAR smc;
|
PCRE2_UCHAR smc;
|
||||||
while (start_match < end_subject &&
|
while (start_match < end_subject &&
|
||||||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||||
smc != first_cu2)
|
smc != first_cu2)
|
||||||
start_match++;
|
start_match++;
|
||||||
|
#else
|
||||||
|
/* In 8-bit mode, the use of memchr() gives a big speed up, even
|
||||||
|
though we have to call it twice in order to find the earliest
|
||||||
|
occurrence of the code unit in either of its cases. Caching is used
|
||||||
|
to remember the positions of previously found code units. This can
|
||||||
|
make a huge difference when the strings are very long and only one
|
||||||
|
case is actually present. */
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
|
||||||
PCRE2_SPTR pp1 = NULL;
|
PCRE2_SPTR pp1 = NULL;
|
||||||
PCRE2_SPTR pp2 = NULL;
|
PCRE2_SPTR pp2 = NULL;
|
||||||
PCRE2_SIZE cu2size = end_subject - start_match;
|
PCRE2_SIZE searchlength = end_subject - start_match;
|
||||||
|
|
||||||
if (!memchr_not_found_first_cu)
|
/* If we haven't got a previously found position for first_cu, or if
|
||||||
|
the current starting position is later, we need to do a search. If
|
||||||
|
the code unit is not found, set it to the end. */
|
||||||
|
|
||||||
|
if (memchr_found_first_cu == NULL ||
|
||||||
|
start_match > memchr_found_first_cu)
|
||||||
{
|
{
|
||||||
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
pp1 = memchr(start_match, first_cu, searchlength);
|
||||||
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
|
||||||
else cu2size = pp1 - start_match;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
|
/* If the start is before a previously found position, use the
|
||||||
to see if the other case is earlier, so we can set "not found" only
|
previous position, or NULL if a previous search failed. */
|
||||||
when both searches have returned NULL. */
|
|
||||||
|
|
||||||
if (!memchr_not_found_first_cu2)
|
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
|
||||||
|
memchr_found_first_cu;
|
||||||
|
|
||||||
|
/* Do the same thing for the other case. */
|
||||||
|
|
||||||
|
if (memchr_found_first_cu2 == NULL ||
|
||||||
|
start_match > memchr_found_first_cu2)
|
||||||
{
|
{
|
||||||
pp2 = memchr(start_match, first_cu2, cu2size);
|
pp2 = memchr(start_match, first_cu2, searchlength);
|
||||||
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
|
||||||
|
memchr_found_first_cu2;
|
||||||
|
|
||||||
|
/* Set the start to the end of the subject if neither case was found.
|
||||||
|
Otherwise, use the earlier found point. */
|
||||||
|
|
||||||
if (pp1 == NULL)
|
if (pp1 == NULL)
|
||||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||||
else
|
else
|
||||||
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
|
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
|
||||||
#endif
|
|
||||||
|
#endif /* 8-bit handling */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The caseful case */
|
/* The caseful case is much simpler. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2015-2020 University of Cambridge
|
New API code Copyright (c) 2015-2021 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -6117,8 +6117,8 @@ BOOL has_req_cu = FALSE;
|
||||||
BOOL startline;
|
BOOL startline;
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
BOOL memchr_not_found_first_cu;
|
PCRE2_SPTR memchr_found_first_cu;
|
||||||
BOOL memchr_not_found_first_cu2;
|
PCRE2_SPTR memchr_found_first_cu2;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
PCRE2_UCHAR first_cu = 0;
|
PCRE2_UCHAR first_cu = 0;
|
||||||
|
@ -6712,8 +6712,8 @@ start_partial = match_partial = NULL;
|
||||||
mb->hitend = FALSE;
|
mb->hitend = FALSE;
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
memchr_not_found_first_cu = FALSE;
|
memchr_found_first_cu = NULL;
|
||||||
memchr_not_found_first_cu2 = FALSE;
|
memchr_found_first_cu2 = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for(;;)
|
for(;;)
|
||||||
|
@ -6782,13 +6782,7 @@ for(;;)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Not anchored. Advance to a unique first code unit if there is one. In
|
/* Not anchored. Advance to a unique first code unit if there is one. */
|
||||||
8-bit mode, the use of memchr() gives a big speed up, even though we have
|
|
||||||
to call it twice in caseless mode, in order to find the earliest occurrence
|
|
||||||
of the character in either of its cases. If a call to memchr() that
|
|
||||||
searches the rest of the subject fails to find one case, remember that in
|
|
||||||
order not to keep on repeating the search. This can make a huge difference
|
|
||||||
when the strings are very long and only one case is present. */
|
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -6796,43 +6790,68 @@ for(;;)
|
||||||
{
|
{
|
||||||
if (first_cu != first_cu2) /* Caseless */
|
if (first_cu != first_cu2) /* Caseless */
|
||||||
{
|
{
|
||||||
|
/* In 16-bit and 32-bit modes we have to do our own search, so can
|
||||||
|
look for both cases at once. */
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH != 8
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
PCRE2_UCHAR smc;
|
PCRE2_UCHAR smc;
|
||||||
while (start_match < end_subject &&
|
while (start_match < end_subject &&
|
||||||
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
(smc = UCHAR21TEST(start_match)) != first_cu &&
|
||||||
smc != first_cu2)
|
smc != first_cu2)
|
||||||
start_match++;
|
start_match++;
|
||||||
|
#else
|
||||||
|
/* In 8-bit mode, the use of memchr() gives a big speed up, even
|
||||||
|
though we have to call it twice in order to find the earliest
|
||||||
|
occurrence of the code unit in either of its cases. Caching is used
|
||||||
|
to remember the positions of previously found code units. This can
|
||||||
|
make a huge difference when the strings are very long and only one
|
||||||
|
case is actually present. */
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
|
||||||
PCRE2_SPTR pp1 = NULL;
|
PCRE2_SPTR pp1 = NULL;
|
||||||
PCRE2_SPTR pp2 = NULL;
|
PCRE2_SPTR pp2 = NULL;
|
||||||
PCRE2_SIZE cu2size = end_subject - start_match;
|
PCRE2_SIZE searchlength = end_subject - start_match;
|
||||||
|
|
||||||
if (!memchr_not_found_first_cu)
|
/* If we haven't got a previously found position for first_cu, or if
|
||||||
|
the current starting position is later, we need to do a search. If
|
||||||
|
the code unit is not found, set it to the end. */
|
||||||
|
|
||||||
|
if (memchr_found_first_cu == NULL ||
|
||||||
|
start_match > memchr_found_first_cu)
|
||||||
{
|
{
|
||||||
pp1 = memchr(start_match, first_cu, end_subject - start_match);
|
pp1 = memchr(start_match, first_cu, searchlength);
|
||||||
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
|
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
|
||||||
else cu2size = pp1 - start_match;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
|
/* If the start is before a previously found position, use the
|
||||||
to see if the other case is earlier, so we can set "not found" only
|
previous position, or NULL if a previous search failed. */
|
||||||
when both searches have returned NULL. */
|
|
||||||
|
|
||||||
if (!memchr_not_found_first_cu2)
|
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
|
||||||
|
memchr_found_first_cu;
|
||||||
|
|
||||||
|
/* Do the same thing for the other case. */
|
||||||
|
|
||||||
|
if (memchr_found_first_cu2 == NULL ||
|
||||||
|
start_match > memchr_found_first_cu2)
|
||||||
{
|
{
|
||||||
pp2 = memchr(start_match, first_cu2, cu2size);
|
pp2 = memchr(start_match, first_cu2, searchlength);
|
||||||
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
|
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
|
||||||
|
memchr_found_first_cu2;
|
||||||
|
|
||||||
|
/* Set the start to the end of the subject if neither case was found.
|
||||||
|
Otherwise, use the earlier found point. */
|
||||||
|
|
||||||
if (pp1 == NULL)
|
if (pp1 == NULL)
|
||||||
start_match = (pp2 == NULL)? end_subject : pp2;
|
start_match = (pp2 == NULL)? end_subject : pp2;
|
||||||
else
|
else
|
||||||
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
|
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
|
||||||
#endif
|
|
||||||
|
#endif /* 8-bit handling */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The caseful case */
|
/* The caseful case is much simpler. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue