Tweak limits on "must have" code unit searches (improves performance in some cases).
This commit is contained in:
parent
4f31de2866
commit
d5dc4e0c33
|
@ -22,6 +22,9 @@ PCRE2_MATCH_INVALID_UTF compile-time option.
|
||||||
|
|
||||||
6. Add support for invalid UTF-8 to pcre2grep.
|
6. Add support for invalid UTF-8 to pcre2grep.
|
||||||
|
|
||||||
|
7. Adjust the limit for "must have" code unit searching, in particular,
|
||||||
|
increase it substantially for non-anchored patterns.
|
||||||
|
|
||||||
|
|
||||||
Version 10.33 16-April-2019
|
Version 10.33 16-April-2019
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
|
@ -3658,7 +3658,7 @@ for (;;)
|
||||||
while (start_match < end_subject && UCHAR21TEST(start_match) !=
|
while (start_match < end_subject && UCHAR21TEST(start_match) !=
|
||||||
first_cu)
|
first_cu)
|
||||||
start_match++;
|
start_match++;
|
||||||
#else
|
#else /* 8-bit code units */
|
||||||
start_match = memchr(start_match, first_cu, end_subject - start_match);
|
start_match = memchr(start_match, first_cu, end_subject - start_match);
|
||||||
if (start_match == NULL) start_match = end_subject;
|
if (start_match == NULL) start_match = end_subject;
|
||||||
#endif
|
#endif
|
||||||
|
@ -3745,6 +3745,8 @@ for (;;)
|
||||||
|
|
||||||
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
|
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
|
||||||
{
|
{
|
||||||
|
PCRE2_SPTR p;
|
||||||
|
|
||||||
/* The minimum matching length is a lower bound; no string of that
|
/* The minimum matching length is a lower bound; no string of that
|
||||||
length may actually match the pattern. Although the value is, strictly,
|
length may actually match the pattern. Although the value is, strictly,
|
||||||
in characters, we treat it as code units to avoid spending too much time
|
in characters, we treat it as code units to avoid spending too much time
|
||||||
|
@ -3758,37 +3760,63 @@ for (;;)
|
||||||
point. This optimization can save a huge amount of backtracking in
|
point. This optimization can save a huge amount of backtracking in
|
||||||
patterns with nested unlimited repeats that aren't going to match.
|
patterns with nested unlimited repeats that aren't going to match.
|
||||||
Writing separate code for cased/caseless versions makes it go faster, as
|
Writing separate code for cased/caseless versions makes it go faster, as
|
||||||
does using an autoincrement and backing off on a match.
|
does using an autoincrement and backing off on a match. As in the case of
|
||||||
|
the first code unit, using memchr() in the 8-bit library gives a big
|
||||||
|
speed up. Unlike the first_cu check above, we do not always need to call
|
||||||
|
memchr() twice in the caseless case because we only need to check for the
|
||||||
|
presence of the character in either case, not find the first occurrence.
|
||||||
|
|
||||||
|
The search can be skipped if the code unit was found later than the
|
||||||
|
current starting point in a previous iteration of the bumpalong loop.
|
||||||
|
|
||||||
HOWEVER: when the subject string is very, very long, searching to its end
|
HOWEVER: when the subject string is very, very long, searching to its end
|
||||||
can take a long time, and give bad performance on quite ordinary
|
can take a long time, and give bad performance on quite ordinary
|
||||||
patterns. This showed up when somebody was matching something like
|
patterns. This showed up when somebody was matching something like
|
||||||
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
|
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
|
||||||
sufficiently long. */
|
sufficiently long, but it's worth searching a lot more for unanchored
|
||||||
|
patterns. */
|
||||||
|
|
||||||
if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
|
p = start_match + (has_first_cu? 1:0);
|
||||||
|
if (has_req_cu && p > req_cu_ptr)
|
||||||
{
|
{
|
||||||
PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
|
PCRE2_SIZE check_length = end_subject - start_match;
|
||||||
|
|
||||||
/* We don't need to repeat the search if we haven't yet reached the
|
if (check_length < REQ_CU_MAX ||
|
||||||
place we found it at last time. */
|
(!anchored && check_length < REQ_CU_MAX * 1000))
|
||||||
|
|
||||||
if (p > req_cu_ptr)
|
|
||||||
{
|
{
|
||||||
if (req_cu != req_cu2)
|
if (req_cu != req_cu2) /* Caseless */
|
||||||
{
|
{
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
while (p < end_subject)
|
while (p < end_subject)
|
||||||
{
|
{
|
||||||
uint32_t pp = UCHAR21INCTEST(p);
|
uint32_t pp = UCHAR21INCTEST(p);
|
||||||
if (pp == req_cu || pp == req_cu2) { p--; break; }
|
if (pp == req_cu || pp == req_cu2) { p--; break; }
|
||||||
}
|
}
|
||||||
|
#else /* 8-bit code units */
|
||||||
|
PCRE2_SPTR pp = p;
|
||||||
|
p = memchr(pp, req_cu, end_subject - pp);
|
||||||
|
if (p == NULL)
|
||||||
|
{
|
||||||
|
p = memchr(pp, req_cu2, end_subject - pp);
|
||||||
|
if (p == NULL) p = end_subject;
|
||||||
|
}
|
||||||
|
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The caseful case */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
while (p < end_subject)
|
while (p < end_subject)
|
||||||
{
|
{
|
||||||
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
|
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else /* 8-bit code units */
|
||||||
|
p = memchr(p, req_cu, end_subject - p);
|
||||||
|
if (p == NULL) p = end_subject;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we can't find the required code unit, break the matching loop,
|
/* If we can't find the required code unit, break the matching loop,
|
||||||
|
|
|
@ -535,13 +535,14 @@ enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */
|
||||||
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
|
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
|
||||||
|
|
||||||
/* The maximum remaining length of subject we are prepared to search for a
|
/* The maximum remaining length of subject we are prepared to search for a
|
||||||
req_unit match. In 8-bit mode, memchr() is used and is much faster than the
|
req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is
|
||||||
search loop that has to be used in 16-bit and 32-bit modes. */
|
much faster than the search loop that has to be used in 16-bit and 32-bit
|
||||||
|
modes. */
|
||||||
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
#define REQ_CU_MAX 2000
|
#define REQ_CU_MAX 5000
|
||||||
#else
|
#else
|
||||||
#define REQ_CU_MAX 1000
|
#define REQ_CU_MAX 2000
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Offsets for the bitmap tables in the cbits set of tables. Each table
|
/* Offsets for the bitmap tables in the cbits set of tables. Each table
|
||||||
|
|
|
@ -6783,6 +6783,8 @@ for(;;)
|
||||||
|
|
||||||
if (!mb->partial)
|
if (!mb->partial)
|
||||||
{
|
{
|
||||||
|
PCRE2_SPTR p;
|
||||||
|
|
||||||
/* The minimum matching length is a lower bound; no string of that length
|
/* The minimum matching length is a lower bound; no string of that length
|
||||||
may actually match the pattern. Although the value is, strictly, in
|
may actually match the pattern. Although the value is, strictly, in
|
||||||
characters, we treat it as code units to avoid spending too much time in
|
characters, we treat it as code units to avoid spending too much time in
|
||||||
|
@ -6806,60 +6808,57 @@ for(;;)
|
||||||
memchr() twice in the caseless case because we only need to check for the
|
memchr() twice in the caseless case because we only need to check for the
|
||||||
presence of the character in either case, not find the first occurrence.
|
presence of the character in either case, not find the first occurrence.
|
||||||
|
|
||||||
|
The search can be skipped if the code unit was found later than the
|
||||||
|
current starting point in a previous iteration of the bumpalong loop.
|
||||||
|
|
||||||
HOWEVER: when the subject string is very, very long, searching to its end
|
HOWEVER: when the subject string is very, very long, searching to its end
|
||||||
can take a long time, and give bad performance on quite ordinary
|
can take a long time, and give bad performance on quite ordinary
|
||||||
patterns. This showed up when somebody was matching something like
|
anchored patterns. This showed up when somebody was matching something
|
||||||
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
|
like /^\d+C/ on a 32-megabyte string... so we don't do this when the
|
||||||
sufficiently long. */
|
string is sufficiently long, but it's worth searching a lot more for
|
||||||
|
unanchored patterns. */
|
||||||
|
|
||||||
if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
|
p = start_match + (has_first_cu? 1:0);
|
||||||
|
if (has_req_cu && p > req_cu_ptr)
|
||||||
{
|
{
|
||||||
PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
|
PCRE2_SIZE check_length = end_subject - start_match;
|
||||||
|
|
||||||
/* We don't need to repeat the search if we haven't yet reached the
|
if (check_length < REQ_CU_MAX ||
|
||||||
place we found it last time round the bumpalong loop. */
|
(!anchored && check_length < REQ_CU_MAX * 1000))
|
||||||
|
|
||||||
if (p > req_cu_ptr)
|
|
||||||
{
|
{
|
||||||
if (p < end_subject)
|
if (req_cu != req_cu2) /* Caseless */
|
||||||
{
|
{
|
||||||
if (req_cu != req_cu2) /* Caseless */
|
|
||||||
{
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH != 8
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
do
|
while (p < end_subject)
|
||||||
{
|
{
|
||||||
uint32_t pp = UCHAR21INCTEST(p);
|
uint32_t pp = UCHAR21INCTEST(p);
|
||||||
if (pp == req_cu || pp == req_cu2) { p--; break; }
|
if (pp == req_cu || pp == req_cu2) { p--; break; }
|
||||||
}
|
|
||||||
while (p < end_subject);
|
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
|
||||||
PCRE2_SPTR pp = p;
|
|
||||||
p = memchr(pp, req_cu, end_subject - pp);
|
|
||||||
if (p == NULL)
|
|
||||||
{
|
|
||||||
p = memchr(pp, req_cu2, end_subject - pp);
|
|
||||||
if (p == NULL) p = end_subject;
|
|
||||||
}
|
|
||||||
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The caseful case */
|
|
||||||
|
|
||||||
else
|
|
||||||
{
|
|
||||||
#if PCRE2_CODE_UNIT_WIDTH != 8
|
|
||||||
do
|
|
||||||
{
|
|
||||||
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
|
|
||||||
}
|
|
||||||
while (p < end_subject);
|
|
||||||
|
|
||||||
#else /* 8-bit code units */
|
#else /* 8-bit code units */
|
||||||
p = memchr(p, req_cu, end_subject - p);
|
PCRE2_SPTR pp = p;
|
||||||
|
p = memchr(pp, req_cu, end_subject - pp);
|
||||||
|
if (p == NULL)
|
||||||
|
{
|
||||||
|
p = memchr(pp, req_cu2, end_subject - pp);
|
||||||
if (p == NULL) p = end_subject;
|
if (p == NULL) p = end_subject;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The caseful case */
|
||||||
|
|
||||||
|
else
|
||||||
|
{
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
|
while (p < end_subject)
|
||||||
|
{
|
||||||
|
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
|
||||||
|
}
|
||||||
|
|
||||||
|
#else /* 8-bit code units */
|
||||||
|
p = memchr(p, req_cu, end_subject - p);
|
||||||
|
if (p == NULL) p = end_subject;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we can't find the required code unit, break the bumpalong loop,
|
/* If we can't find the required code unit, break the bumpalong loop,
|
||||||
|
|
Loading…
Reference in New Issue