Tweak limits on "must have" code unit searches (improves some performance).

This commit is contained in:
Philip.Hazel 2019-05-28 16:34:28 +00:00
parent 4f31de2866
commit d5dc4e0c33
4 changed files with 89 additions and 58 deletions

View File

@ -22,6 +22,9 @@ PCRE2_MATCH_INVALID_UTF compile-time option.
6. Add support for invalid UTF-8 to pcre2grep.
7. Adjust the limit for "must have" code unit searching, in particular,
increase it substantially for non-anchored patterns.
Version 10.33 16-April-2019
---------------------------

View File

@ -3294,10 +3294,10 @@ time. */
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
return PCRE2_ERROR_BADOPTION;
/* Invalid UTF support is not available for DFA matching. */
if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
return PCRE2_ERROR_DFA_UINVALID_UTF;
/* Check that the first field in the block is the magic number. If it is not,
@ -3658,7 +3658,7 @@ for (;;)
while (start_match < end_subject && UCHAR21TEST(start_match) !=
first_cu)
start_match++;
#else
#else /* 8-bit code units */
start_match = memchr(start_match, first_cu, end_subject - start_match);
if (start_match == NULL) start_match = end_subject;
#endif
@ -3745,6 +3745,8 @@ for (;;)
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
{
PCRE2_SPTR p;
/* The minimum matching length is a lower bound; no actual string of that
length may actually match the pattern. Although the value is, strictly,
in characters, we treat it as code units to avoid spending too much time
@ -3758,37 +3760,63 @@ for (;;)
point. This optimization can save a huge amount of backtracking in
patterns with nested unlimited repeats that aren't going to match.
Writing separate code for cased/caseless versions makes it go faster, as
does using an autoincrement and backing off on a match.
does using an autoincrement and backing off on a match. As in the case of
the first code unit, using memchr() in the 8-bit library gives a big
speed up. Unlike the first_cu check above, we do not need to call
memchr() twice in the caseless case because we only need to check for the
presence of the character in either case, not find the first occurrence.
The search can be skipped if the code unit was found later than the
current starting point in a previous iteration of the bumpalong loop.
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
patterns. This showed up when somebody was matching something like
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
sufficiently long. */
sufficiently long, but it's worth searching a lot more for unanchored
patterns. */
if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
p = start_match + (has_first_cu? 1:0);
if (has_req_cu && p > req_cu_ptr)
{
PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
PCRE2_SIZE check_length = end_subject - start_match;
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
if (p > req_cu_ptr)
if (check_length < REQ_CU_MAX ||
(!anchored && check_length < REQ_CU_MAX * 1000))
{
if (req_cu != req_cu2)
if (req_cu != req_cu2) /* Caseless */
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject)
{
uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; }
}
#else /* 8-bit code units */
PCRE2_SPTR pp = p;
p = memchr(pp, req_cu, end_subject - pp);
if (p == NULL)
{
p = memchr(pp, req_cu2, end_subject - pp);
if (p == NULL) p = end_subject;
}
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject)
{
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
}
#else /* 8-bit code units */
p = memchr(p, req_cu, end_subject - p);
if (p == NULL) p = end_subject;
#endif
}
/* If we can't find the required code unit, break the matching loop,

View File

@ -535,13 +535,14 @@ enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
/* The maximum remaining length of subject we are prepared to search for a
req_unit match. In 8-bit mode, memchr() is used and is much faster than the
search loop that has to be used in 16-bit and 32-bit modes. */
req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is
much faster than the search loop that has to be used in 16-bit and 32-bit
modes. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define REQ_CU_MAX 2000
#define REQ_CU_MAX 5000
#else
#define REQ_CU_MAX 1000
#define REQ_CU_MAX 2000
#endif
/* Offsets for the bitmap tables in the cbits set of tables. Each table

View File

@ -6783,6 +6783,8 @@ for(;;)
if (!mb->partial)
{
PCRE2_SPTR p;
/* The minimum matching length is a lower bound; no string of that length
may actually match the pattern. Although the value is, strictly, in
characters, we treat it as code units to avoid spending too much time in
@ -6806,60 +6808,57 @@ for(;;)
memchr() twice in the caseless case because we only need to check for the
presence of the character in either case, not find the first occurrence.
The search can be skipped if the code unit was found later than the
current starting point in a previous iteration of the bumpalong loop.
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
patterns. This showed up when somebody was matching something like
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
sufficiently long. */
anchored patterns. This showed up when somebody was matching something
like /^\d+C/ on a 32-megabyte string... so we don't do this when the
string is sufficiently long, but it's worth searching a lot more for
unanchored patterns. */
if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
p = start_match + (has_first_cu? 1:0);
if (has_req_cu && p > req_cu_ptr)
{
PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
PCRE2_SIZE check_length = end_subject - start_match;
/* We don't need to repeat the search if we haven't yet reached the
place we found it last time round the bumpalong loop. */
if (p > req_cu_ptr)
if (check_length < REQ_CU_MAX ||
(!anchored && check_length < REQ_CU_MAX * 1000))
{
if (p < end_subject)
if (req_cu != req_cu2) /* Caseless */
{
if (req_cu != req_cu2) /* Caseless */
{
#if PCRE2_CODE_UNIT_WIDTH != 8
do
{
uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; }
}
while (p < end_subject);
#else /* 8-bit code units */
PCRE2_SPTR pp = p;
p = memchr(pp, req_cu, end_subject - pp);
if (p == NULL)
{
p = memchr(pp, req_cu2, end_subject - pp);
if (p == NULL) p = end_subject;
}
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
while (p < end_subject)
{
uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; }
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
do
{
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
}
while (p < end_subject);
#else /* 8-bit code units */
p = memchr(p, req_cu, end_subject - p);
PCRE2_SPTR pp = p;
p = memchr(pp, req_cu, end_subject - pp);
if (p == NULL)
{
p = memchr(pp, req_cu2, end_subject - pp);
if (p == NULL) p = end_subject;
#endif
}
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject)
{
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
}
#else /* 8-bit code units */
p = memchr(p, req_cu, end_subject - p);
if (p == NULL) p = end_subject;
#endif
}
/* If we can't find the required code unit, break the bumpalong loop,