Tweak limits on "must have" code unit searches (improves performance in some cases).

This commit is contained in:
Philip.Hazel 2019-05-28 16:34:28 +00:00
parent 4f31de2866
commit d5dc4e0c33
4 changed files with 89 additions and 58 deletions

View File

@ -22,6 +22,9 @@ PCRE2_MATCH_INVALID_UTF compile-time option.
6. Add support for invalid UTF-8 to pcre2grep.
7. Adjust the limit for "must have" code unit searching; in particular,
increase it substantially for non-anchored patterns.
Version 10.33 16-April-2019
---------------------------

View File

@ -3658,7 +3658,7 @@ for (;;)
while (start_match < end_subject && UCHAR21TEST(start_match) != while (start_match < end_subject && UCHAR21TEST(start_match) !=
first_cu) first_cu)
start_match++; start_match++;
#else #else /* 8-bit code units */
start_match = memchr(start_match, first_cu, end_subject - start_match); start_match = memchr(start_match, first_cu, end_subject - start_match);
if (start_match == NULL) start_match = end_subject; if (start_match == NULL) start_match = end_subject;
#endif #endif
@ -3745,6 +3745,8 @@ for (;;)
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0) if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
{ {
PCRE2_SPTR p;
/* The minimum matching length is a lower bound; no actual string of that /* The minimum matching length is a lower bound; no actual string of that
length may actually match the pattern. Although the value is, strictly, length may actually match the pattern. Although the value is, strictly,
in characters, we treat it as code units to avoid spending too much time in characters, we treat it as code units to avoid spending too much time
@ -3758,37 +3760,63 @@ for (;;)
point. This optimization can save a huge amount of backtracking in point. This optimization can save a huge amount of backtracking in
patterns with nested unlimited repeats that aren't going to match. patterns with nested unlimited repeats that aren't going to match.
Writing separate code for cased/caseless versions makes it go faster, as Writing separate code for cased/caseless versions makes it go faster, as
does using an autoincrement and backing off on a match. does using an autoincrement and backing off on a match. As in the case of
the first code unit, using memchr() in the 8-bit library gives a big
speed up. Unlike the first_cu check above, in the caseless case a second
memchr() call is needed only when the first one finds nothing, because we
only need to check for the presence of the character in either case, not
find the first occurrence.
The search can be skipped if the code unit was found later than the
current starting point in a previous iteration of the bumpalong loop.
HOWEVER: when the subject string is very, very long, searching to its end HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary can take a long time, and give bad performance on quite ordinary
patterns. This showed up when somebody was matching something like patterns. This showed up when somebody was matching something like
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
sufficiently long. */ sufficiently long, but it's worth searching a lot more for unanchored
patterns. */
if (has_req_cu && end_subject - start_match < REQ_CU_MAX) p = start_match + (has_first_cu? 1:0);
if (has_req_cu && p > req_cu_ptr)
{ {
PCRE2_SPTR p = start_match + (has_first_cu? 1:0); PCRE2_SIZE check_length = end_subject - start_match;
/* We don't need to repeat the search if we haven't yet reached the if (check_length < REQ_CU_MAX ||
place we found it at last time. */ (!anchored && check_length < REQ_CU_MAX * 1000))
if (p > req_cu_ptr)
{ {
if (req_cu != req_cu2) if (req_cu != req_cu2) /* Caseless */
{ {
#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject) while (p < end_subject)
{ {
uint32_t pp = UCHAR21INCTEST(p); uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; } if (pp == req_cu || pp == req_cu2) { p--; break; }
} }
#else /* 8-bit code units */
PCRE2_SPTR pp = p;
p = memchr(pp, req_cu, end_subject - pp);
if (p == NULL)
{
p = memchr(pp, req_cu2, end_subject - pp);
if (p == NULL) p = end_subject;
}
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
} }
/* The caseful case */
else else
{ {
#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject) while (p < end_subject)
{ {
if (UCHAR21INCTEST(p) == req_cu) { p--; break; } if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
} }
#else /* 8-bit code units */
p = memchr(p, req_cu, end_subject - p);
if (p == NULL) p = end_subject;
#endif
} }
/* If we can't find the required code unit, break the matching loop, /* If we can't find the required code unit, break the matching loop,

View File

@ -535,13 +535,14 @@ enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
/* The maximum remaining length of subject we are prepared to search for a /* The maximum remaining length of subject we are prepared to search for a
req_unit match. In 8-bit mode, memchr() is used and is much faster than the req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is
search loop that has to be used in 16-bit and 32-bit modes. */ much faster than the search loop that has to be used in 16-bit and 32-bit
modes. */
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
#define REQ_CU_MAX 2000 #define REQ_CU_MAX 5000
#else #else
#define REQ_CU_MAX 1000 #define REQ_CU_MAX 2000
#endif #endif
/* Offsets for the bitmap tables in the cbits set of tables. Each table /* Offsets for the bitmap tables in the cbits set of tables. Each table

View File

@ -6783,6 +6783,8 @@ for(;;)
if (!mb->partial) if (!mb->partial)
{ {
PCRE2_SPTR p;
/* The minimum matching length is a lower bound; no string of that length /* The minimum matching length is a lower bound; no string of that length
may actually match the pattern. Although the value is, strictly, in may actually match the pattern. Although the value is, strictly, in
characters, we treat it as code units to avoid spending too much time in characters, we treat it as code units to avoid spending too much time in
@ -6806,60 +6808,57 @@ for(;;)
memchr() twice in the caseless case because we only need to check for the memchr() twice in the caseless case because we only need to check for the
presence of the character in either case, not find the first occurrence. presence of the character in either case, not find the first occurrence.
The search can be skipped if the code unit was found later than the
current starting point in a previous iteration of the bumpalong loop.
HOWEVER: when the subject string is very, very long, searching to its end HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary can take a long time, and give bad performance on quite ordinary
patterns. This showed up when somebody was matching something like anchored patterns. This showed up when somebody was matching something
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is like /^\d+C/ on a 32-megabyte string... so we don't do this when the
sufficiently long. */ string is sufficiently long, but it's worth searching a lot more for
unanchored patterns. */
if (has_req_cu && end_subject - start_match < REQ_CU_MAX) p = start_match + (has_first_cu? 1:0);
if (has_req_cu && p > req_cu_ptr)
{ {
PCRE2_SPTR p = start_match + (has_first_cu? 1:0); PCRE2_SIZE check_length = end_subject - start_match;
/* We don't need to repeat the search if we haven't yet reached the if (check_length < REQ_CU_MAX ||
place we found it last time round the bumpalong loop. */ (!anchored && check_length < REQ_CU_MAX * 1000))
if (p > req_cu_ptr)
{ {
if (p < end_subject) if (req_cu != req_cu2) /* Caseless */
{ {
if (req_cu != req_cu2) /* Caseless */
{
#if PCRE2_CODE_UNIT_WIDTH != 8 #if PCRE2_CODE_UNIT_WIDTH != 8
do while (p < end_subject)
{ {
uint32_t pp = UCHAR21INCTEST(p); uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; } if (pp == req_cu || pp == req_cu2) { p--; break; }
}
while (p < end_subject);
#else /* 8-bit code units */
PCRE2_SPTR pp = p;
p = memchr(pp, req_cu, end_subject - pp);
if (p == NULL)
{
p = memchr(pp, req_cu2, end_subject - pp);
if (p == NULL) p = end_subject;
}
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
} }
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
do
{
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
}
while (p < end_subject);
#else /* 8-bit code units */ #else /* 8-bit code units */
p = memchr(p, req_cu, end_subject - p); PCRE2_SPTR pp = p;
p = memchr(pp, req_cu, end_subject - pp);
if (p == NULL)
{
p = memchr(pp, req_cu2, end_subject - pp);
if (p == NULL) p = end_subject; if (p == NULL) p = end_subject;
#endif
} }
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject)
{
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
}
#else /* 8-bit code units */
p = memchr(p, req_cu, end_subject - p);
if (p == NULL) p = end_subject;
#endif
} }
/* If we can't find the required code unit, break the bumpalong loop, /* If we can't find the required code unit, break the bumpalong loop,