Tweak limits on "must have" code unit searches (improves some performance).

2019-05-28 16:34:28 +00:00 · 2019-05-28 16:34:28 +00:00 · d5dc4e0c33
parent 4f31de2866
commit d5dc4e0c33
4 changed files with 89 additions and 58 deletions
--- a/3
+++ b/3
@ -22,6 +22,9 @@ PCRE2_MATCH_INVALID_UTF compile-time option.
 6. Add support for invalid UTF-8 to pcre2grep.
 7. Adjust the limit for "must have" code unit searching, in particular,
 increase it substantially for non-anchored patterns.
 Version 10.33 16-April-2019
 ---------------------------
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -3658,7 +3658,7 @@ for (;;)
          while (start_match < end_subject && UCHAR21TEST(start_match) !=
                 first_cu)
            start_match++;
-#else
+#else  /* 8-bit code units */
          start_match = memchr(start_match, first_cu, end_subject - start_match);
          if (start_match == NULL) start_match = end_subject;
 #endif
@ -3745,6 +3745,8 @@ for (;;)
    if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
      {
      PCRE2_SPTR p;
      /* The minimum matching length is a lower bound; no actual string of that
      length may actually match the pattern. Although the value is, strictly,
      in characters, we treat it as code units to avoid spending too much time
@ -3758,37 +3760,63 @@ for (;;)
      point. This optimization can save a huge amount of backtracking in
      patterns with nested unlimited repeats that aren't going to match.
      Writing separate code for cased/caseless versions makes it go faster, as
-      does using an autoincrement and backing off on a match.
+      does using an autoincrement and backing off on a match. As in the case of
      the first code unit, using memchr() in the 8-bit library gives a big
      speed up. Unlike the first_cu check above, we do not need to call
      memchr() twice in the caseless case because we only need to check for the
      presence of the character in either case, not find the first occurrence.
      The search can be skipped if the code unit was found later than the
      current starting point in a previous iteration of the bumpalong loop.
      HOWEVER: when the subject string is very, very long, searching to its end
      can take a long time, and give bad performance on quite ordinary
      patterns. This showed up when somebody was matching something like
      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
-      sufficiently long. */
+      sufficiently long, but it's worth searching a lot more for unanchored
      patterns. */
-      if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+      p = start_match + (has_first_cu? 1:0);
      if (has_req_cu && p > req_cu_ptr)
        {
-        PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+        PCRE2_SIZE check_length = end_subject - start_match;
-        /* We don't need to repeat the search if we haven't yet reached the
+        if (check_length < REQ_CU_MAX ||
-        place we found it at last time. */
+              (!anchored && check_length < REQ_CU_MAX * 1000))
        if (p > req_cu_ptr)
          {
-          if (req_cu != req_cu2)
+          if (req_cu != req_cu2)  /* Caseless */
            {
 #if PCRE2_CODE_UNIT_WIDTH != 8
            while (p < end_subject)
              {
              uint32_t pp = UCHAR21INCTEST(p);
              if (pp == req_cu || pp == req_cu2) { p--; break; }
              }
 #else  /* 8-bit code units */
            PCRE2_SPTR pp = p;
            p = memchr(pp, req_cu, end_subject - pp);
            if (p == NULL)
              {
              p = memchr(pp, req_cu2, end_subject - pp);
              if (p == NULL) p = end_subject;
              }
 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
            }
          /* The caseful case */
          else
            {
 #if PCRE2_CODE_UNIT_WIDTH != 8
            while (p < end_subject)
              {
              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
              }
 #else  /* 8-bit code units */
            p = memchr(p, req_cu, end_subject - p);
            if (p == NULL) p = end_subject;
 #endif
            }
          /* If we can't find the required code unit, break the matching loop,
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -535,13 +535,14 @@ enum { PCRE2_MATCHEDBY_INTERPRETER,     /* pcre2_match() */
 #define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
 /* The maximum remaining length of subject we are prepared to search for a
-req_unit match. In 8-bit mode, memchr() is used and is much faster than the
+req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is
-search loop that has to be used in 16-bit and 32-bit modes. */
+much faster than the search loop that has to be used in 16-bit and 32-bit
 modes. */
 #if PCRE2_CODE_UNIT_WIDTH == 8
-#define REQ_CU_MAX 2000
+#define REQ_CU_MAX       5000
 #else
-#define REQ_CU_MAX 1000
+#define REQ_CU_MAX       2000
 #endif
 /* Offsets for the bitmap tables in the cbits set of tables. Each table
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -6783,6 +6783,8 @@ for(;;)
    if (!mb->partial)
      {
      PCRE2_SPTR p;
      /* The minimum matching length is a lower bound; no string of that length
      may actually match the pattern. Although the value is, strictly, in
      characters, we treat it as code units to avoid spending too much time in
@ -6806,60 +6808,57 @@ for(;;)
      memchr() twice in the caseless case because we only need to check for the
      presence of the character in either case, not find the first occurrence.
      The search can be skipped if the code unit was found later than the
      current starting point in a previous iteration of the bumpalong loop.
      HOWEVER: when the subject string is very, very long, searching to its end
      can take a long time, and give bad performance on quite ordinary
-      patterns. This showed up when somebody was matching something like
+      anchored patterns. This showed up when somebody was matching something
-      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
+      like /^\d+C/ on a 32-megabyte string... so we don't do this when the
-      sufficiently long. */
+      string is sufficiently long, but it's worth searching a lot more for
      unanchored patterns. */
-      if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+      p = start_match + (has_first_cu? 1:0);
      if (has_req_cu && p > req_cu_ptr)
        {
-        PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+        PCRE2_SIZE check_length = end_subject - start_match;
-        /* We don't need to repeat the search if we haven't yet reached the
+        if (check_length < REQ_CU_MAX ||
-        place we found it last time round the bumpalong loop. */
+              (!anchored && check_length < REQ_CU_MAX * 1000))
        if (p > req_cu_ptr)
          {
-          if (p < end_subject)
+          if (req_cu != req_cu2)  /* Caseless */
            {
            if (req_cu != req_cu2)  /* Caseless */
              {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-              do
+            while (p < end_subject)
-                {
+              {
-                uint32_t pp = UCHAR21INCTEST(p);
+              uint32_t pp = UCHAR21INCTEST(p);
-                if (pp == req_cu || pp == req_cu2) { p--; break; }
+              if (pp == req_cu || pp == req_cu2) { p--; break; }
                }
              while (p < end_subject);
 #else  /* 8-bit code units */
              PCRE2_SPTR pp = p;
              p = memchr(pp, req_cu, end_subject - pp);
              if (p == NULL)
                {
                p = memchr(pp, req_cu2, end_subject - pp);
                if (p == NULL) p = end_subject;
                }
 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
              }
            /* The caseful case */
            else
              {
 #if PCRE2_CODE_UNIT_WIDTH != 8
              do
                {
                if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
                }
              while (p < end_subject);
 #else  /* 8-bit code units */
-              p = memchr(p, req_cu, end_subject - p);
+            PCRE2_SPTR pp = p;
            p = memchr(pp, req_cu, end_subject - pp);
            if (p == NULL)
              {
              p = memchr(pp, req_cu2, end_subject - pp);
              if (p == NULL) p = end_subject;
 #endif
              }
 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
            }
          /* The caseful case */
          else
            {
 #if PCRE2_CODE_UNIT_WIDTH != 8
            while (p < end_subject)
              {
              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
              }
 #else  /* 8-bit code units */
            p = memchr(p, req_cu, end_subject - p);
            if (p == NULL) p = end_subject;
 #endif
            }
          /* If we can't find the required code unit, break the bumpalong loop,