From d5dc4e0c338013cc7b82aafd832bd4f4ac36fe52 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 28 May 2019 16:34:28 +0000 Subject: [PATCH] Tweak limits on "must have" code unit searches (improves some performance). --- ChangeLog | 3 ++ src/pcre2_dfa_match.c | 52 ++++++++++++++++++++------- src/pcre2_internal.h | 9 ++--- src/pcre2_match.c | 83 +++++++++++++++++++++---------------------- 4 files changed, 89 insertions(+), 58 deletions(-) diff --git a/ChangeLog b/ChangeLog index 84e051a..7382953 100644 --- a/ChangeLog +++ b/ChangeLog @@ -22,6 +22,9 @@ PCRE2_MATCH_INVALID_UTF compile-time option. 6. Add support for invalid UTF-8 to pcre2grep. +7. Adjust the limit for "must have" code unit searching, in particular, +increase it substantially for non-anchored patterns. + Version 10.33 16-April-2019 --------------------------- diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index d20f02a..911e9b9 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -3294,10 +3294,10 @@ time. */ if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) return PCRE2_ERROR_BADOPTION; - + /* Invalid UTF support is not available for DFA matching. */ -if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0) +if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0) return PCRE2_ERROR_DFA_UINVALID_UTF; /* Check that the first field in the block is the magic number. If it is not, @@ -3658,7 +3658,7 @@ for (;;) while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu) start_match++; -#else +#else /* 8-bit code units */ start_match = memchr(start_match, first_cu, end_subject - start_match); if (start_match == NULL) start_match = end_subject; #endif @@ -3745,6 +3745,8 @@ for (;;) if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0) { + PCRE2_SPTR p; + /* The minimum matching length is a lower bound; no actual string of that length may actually match the pattern. Although the value is, strictly, in characters, we treat it as code units to avoid spending too much time @@ -3758,37 +3760,63 @@ for (;;) point. This optimization can save a huge amount of backtracking in patterns with nested unlimited repeats that aren't going to match. Writing separate code for cased/caseless versions makes it go faster, as - does using an autoincrement and backing off on a match. + does using an autoincrement and backing off on a match. As in the case of + the first code unit, using memchr() in the 8-bit library gives a big + speed up. Unlike the first_cu check above, we do not need to call + memchr() twice in the caseless case because we only need to check for the + presence of the character in either case, not find the first occurrence. + + The search can be skipped if the code unit was found later than the + current starting point in a previous iteration of the bumpalong loop. HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This showed up when somebody was matching something like /^\d+C/ on a 32-megabyte string... so we don't do this when the string is - sufficiently long. */ + sufficiently long, but it's worth searching a lot more for unanchored + patterns. */ - if (has_req_cu && end_subject - start_match < REQ_CU_MAX) + p = start_match + (has_first_cu? 1:0); + if (has_req_cu && p > req_cu_ptr) { - PCRE2_SPTR p = start_match + (has_first_cu? 1:0); + PCRE2_SIZE check_length = end_subject - start_match; - /* We don't need to repeat the search if we haven't yet reached the - place we found it at last time. */ - - if (p > req_cu_ptr) + if (check_length < REQ_CU_MAX || + (!anchored && check_length < REQ_CU_MAX * 1000)) { - if (req_cu != req_cu2) + if (req_cu != req_cu2) /* Caseless */ { +#if PCRE2_CODE_UNIT_WIDTH != 8 while (p < end_subject) { uint32_t pp = UCHAR21INCTEST(p); if (pp == req_cu || pp == req_cu2) { p--; break; } } +#else /* 8-bit code units */ + PCRE2_SPTR pp = p; + p = memchr(pp, req_cu, end_subject - pp); + if (p == NULL) + { + p = memchr(pp, req_cu2, end_subject - pp); + if (p == NULL) p = end_subject; + } +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ } + + /* The caseful case */ + else { +#if PCRE2_CODE_UNIT_WIDTH != 8 while (p < end_subject) { if (UCHAR21INCTEST(p) == req_cu) { p--; break; } } + +#else /* 8-bit code units */ + p = memchr(p, req_cu, end_subject - p); + if (p == NULL) p = end_subject; +#endif } /* If we can't find the required code unit, break the matching loop, diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 814d91b..7fd8044 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -535,13 +535,14 @@ enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */ #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ /* The maximum remaining length of subject we are prepared to search for a -req_unit match. In 8-bit mode, memchr() is used and is much faster than the -search loop that has to be used in 16-bit and 32-bit modes. */ +req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is +much faster than the search loop that has to be used in 16-bit and 32-bit +modes. */ #if PCRE2_CODE_UNIT_WIDTH == 8 -#define REQ_CU_MAX 2000 +#define REQ_CU_MAX 5000 #else -#define REQ_CU_MAX 1000 +#define REQ_CU_MAX 2000 #endif /* Offsets for the bitmap tables in the cbits set of tables. Each table diff --git a/src/pcre2_match.c b/src/pcre2_match.c index b2dae2d..849bb58 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -6783,6 +6783,8 @@ for(;;) if (!mb->partial) { + PCRE2_SPTR p; + /* The minimum matching length is a lower bound; no string of that length may actually match the pattern. Although the value is, strictly, in characters, we treat it as code units to avoid spending too much time in @@ -6806,60 +6808,57 @@ for(;;) memchr() twice in the caseless case because we only need to check for the presence of the character in either case, not find the first occurrence. + The search can be skipped if the code unit was found later than the + current starting point in a previous iteration of the bumpalong loop. + HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary - patterns. This showed up when somebody was matching something like - /^\d+C/ on a 32-megabyte string... so we don't do this when the string is - sufficiently long. */ + anchored patterns. This showed up when somebody was matching something + like /^\d+C/ on a 32-megabyte string... so we don't do this when the + string is sufficiently long, but it's worth searching a lot more for + unanchored patterns. */ - if (has_req_cu && end_subject - start_match < REQ_CU_MAX) + p = start_match + (has_first_cu? 1:0); + if (has_req_cu && p > req_cu_ptr) { - PCRE2_SPTR p = start_match + (has_first_cu? 1:0); + PCRE2_SIZE check_length = end_subject - start_match; - /* We don't need to repeat the search if we haven't yet reached the - place we found it last time round the bumpalong loop. */ - - if (p > req_cu_ptr) + if (check_length < REQ_CU_MAX || + (!anchored && check_length < REQ_CU_MAX * 1000)) { - if (p < end_subject) + if (req_cu != req_cu2) /* Caseless */ { - if (req_cu != req_cu2) /* Caseless */ - { #if PCRE2_CODE_UNIT_WIDTH != 8 - do - { - uint32_t pp = UCHAR21INCTEST(p); - if (pp == req_cu || pp == req_cu2) { p--; break; } - } - while (p < end_subject); - -#else /* 8-bit code units */ - PCRE2_SPTR pp = p; - p = memchr(pp, req_cu, end_subject - pp); - if (p == NULL) - { - p = memchr(pp, req_cu2, end_subject - pp); - if (p == NULL) p = end_subject; - } -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ + while (p < end_subject) + { + uint32_t pp = UCHAR21INCTEST(p); + if (pp == req_cu || pp == req_cu2) { p--; break; } } - - /* The caseful case */ - - else - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - do - { - if (UCHAR21INCTEST(p) == req_cu) { p--; break; } - } - while (p < end_subject); - #else /* 8-bit code units */ - p = memchr(p, req_cu, end_subject - p); + PCRE2_SPTR pp = p; + p = memchr(pp, req_cu, end_subject - pp); + if (p == NULL) + { + p = memchr(pp, req_cu2, end_subject - pp); if (p == NULL) p = end_subject; -#endif } +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ + } + + /* The caseful case */ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (p < end_subject) + { + if (UCHAR21INCTEST(p) == req_cu) { p--; break; } + } + +#else /* 8-bit code units */ + p = memchr(p, req_cu, end_subject - p); + if (p == NULL) p = end_subject; +#endif } /* If we can't find the required code unit, break the bumpalong loop,