From 7bbdc58513c1d920dd75667bb4b667bdb1be343e Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Fri, 6 Sep 2019 16:08:45 +0000
Subject: [PATCH] Fix pessimizing optimization of start-of-match code units in
 the interpreters.

---
 ChangeLog             | 11 +++++++++-
 src/pcre2_dfa_match.c | 36 ++++++++++++++++++++++++++-----
 src/pcre2_match.c     | 50 ++++++++++++++++++++++++++++++++-----------
 3 files changed, 79 insertions(+), 18 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 24c19fb..9f3796c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -92,7 +92,7 @@ within it, the nested lookbehind was not correctly processed. For example, if
 
 19. Implemented pcre2_get_match_data_size().
 
-20. Two alterations to partial matching (not yet done by JIT):
+20. Two alterations to partial matching:
 
     (a) The definition of a partial match is slightly changed: if a pattern
     contains any lookbehinds, an empty partial match may be given, because this
@@ -130,6 +130,15 @@ inspected in that lookahead were not included.
 
 28. Add the pcre2_maketables_free() function.
 
+29. The start-up optimization that looks for a unique initial matching
+code unit in the interpretive engines uses memchr() in 8-bit mode. When the
+search is caseless, it was doing so inefficiently, which ended up slowing down
+the match drastically when the subject was very long. The revised code (a)
+remembers if one case is not found, so it never repeats the search for that
+case after a bumpalong and (b) when one case has been found, it searches only
+up to that position for an earlier occurrence of the other case. This fix
+applies both to the interpretive pcre2_match() and to pcre2_dfa_match().
+
 
 Version 10.33 16-April-2019
 ---------------------------
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 1c2954b..2aec709 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -3254,6 +3254,11 @@ BOOL utf, anchored, startline, firstline;
 BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;
 
+#if PCRE2_CODE_UNIT_WIDTH == 8
+BOOL memchr_not_found_first_cu = FALSE;
+BOOL memchr_not_found_first_cu2 = FALSE;
+#endif
+
 PCRE2_UCHAR first_cu = 0;
 PCRE2_UCHAR first_cu2 = 0;
 PCRE2_UCHAR req_cu = 0;
@@ -3634,7 +3639,10 @@ for (;;)
     /* Not anchored. Advance to a unique first code unit if there is one. In
     8-bit mode, the use of memchr() gives a big speed up, even though we have
     to call it twice in caseless mode, in order to find the earliest occurrence
-    of the character in either of its cases. */
+    of the character in either of its cases. If a call to memchr() that
+    searches the rest of the subject fails to find one case, remember that in
+    order not to keep on repeating the search. This can make a huge difference
+    when the strings are very long and only one case is present. */
 
     else
       {
@@ -3648,11 +3656,29 @@ for (;;)
                 (smc = UCHAR21TEST(start_match)) != first_cu &&
                   smc != first_cu2)
             start_match++;
+
 #else  /* 8-bit code units */
-          PCRE2_SPTR pp1 =
-            memchr(start_match, first_cu, end_subject-start_match);
-          PCRE2_SPTR pp2 =
-            memchr(start_match, first_cu2, end_subject-start_match);
+          PCRE2_SPTR pp1 = NULL;
+          PCRE2_SPTR pp2 = NULL;
+          PCRE2_SIZE cu2size = end_subject - start_match;
+
+          if (!memchr_not_found_first_cu)
+            {
+            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+              else cu2size = pp1 - start_match;
+            }
+
+          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          to see if the other case is earlier, so we can set "not found" only
+          when both searches have returned NULL. */
+
+          if (!memchr_not_found_first_cu2)
+            {
+            pp2 = memchr(start_match, first_cu2, cu2size);
+            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            }
+ 
           if (pp1 == NULL)
             start_match = (pp2 == NULL)? end_subject : pp2;
           else
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 4471183..cf3032e 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -494,11 +494,11 @@ in the code. The second one is used when we already know we are past the end of
 the subject. We set the "hit end" flag if the pointer is at the end of the
 subject and either (a) the pointer is past the earliest inspected character
 (i.e. something has been matched, even if not part of the actual matched
-string), or (b) the pattern contains a lookbehind. These are the conditions for 
+string), or (b) the pattern contains a lookbehind. These are the conditions for
 which adding more characters may allow the current match to continue.
 
 For hard partial matching, we immediately return a partial match. Otherwise,
-carrying on means that a complete match on the current subject will be sought. 
+carrying on means that a complete match on the current subject will be sought.
 A partial match is returned only if no complete match can be found. */
 
 #define CHECK_PARTIAL()\
@@ -5658,10 +5658,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
     case OP_EOD:
     if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
     if (mb->partial != 0)
-      { 
+      {
       mb->hitend = TRUE;
       if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
-      } 
+      }
     Fecode++;
     break;
 
@@ -5687,10 +5687,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
     /* Either at end of string or \n before end. */
 
     if (mb->partial != 0)
-      { 
+      {
       mb->hitend = TRUE;
       if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
-      } 
+      }
     Fecode++;
     break;
 
@@ -6047,6 +6047,11 @@ BOOL has_req_cu = FALSE;
 BOOL startline;
 BOOL utf;
 
+#if PCRE2_CODE_UNIT_WIDTH == 8
+BOOL memchr_not_found_first_cu = FALSE;
+BOOL memchr_not_found_first_cu2 = FALSE;
+#endif
+
 PCRE2_UCHAR first_cu = 0;
 PCRE2_UCHAR first_cu2 = 0;
 PCRE2_UCHAR req_cu = 0;
@@ -6453,7 +6458,7 @@ mb->start_subject = subject;
 mb->start_offset = start_offset;
 mb->end_subject = end_subject;
 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
-mb->allowemptypartial = (re->max_lookbehind > 0) || 
+mb->allowemptypartial = (re->max_lookbehind > 0) ||
     (re->flags & PCRE2_MATCH_EMPTY) != 0;
 mb->poptions = re->overall_options;          /* Pattern options */
 mb->ignore_skip_arg = 0;
@@ -6686,7 +6691,10 @@ for(;;)
     /* Not anchored. Advance to a unique first code unit if there is one. In
     8-bit mode, the use of memchr() gives a big speed up, even though we have
     to call it twice in caseless mode, in order to find the earliest occurrence
-    of the character in either of its cases. */
+    of the character in either of its cases. If a call to memchr() that
+    searches the rest of the subject fails to find one case, remember that in
+    order not to keep on repeating the search. This can make a huge difference
+    when the strings are very long and only one case is present. */
 
     else
       {
@@ -6700,11 +6708,29 @@ for(;;)
                 (smc = UCHAR21TEST(start_match)) != first_cu &&
                   smc != first_cu2)
             start_match++;
+
 #else  /* 8-bit code units */
-          PCRE2_SPTR pp1 =
-            memchr(start_match, first_cu, end_subject-start_match);
-          PCRE2_SPTR pp2 =
-            memchr(start_match, first_cu2, end_subject-start_match);
+          PCRE2_SPTR pp1 = NULL;
+          PCRE2_SPTR pp2 = NULL;
+          PCRE2_SIZE cu2size = end_subject - start_match;
+
+          if (!memchr_not_found_first_cu)
+            {
+            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+              else cu2size = pp1 - start_match;
+            }
+
+          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          to see if the other case is earlier, so we can set "not found" only
+          when both searches have returned NULL. */
+
+          if (!memchr_not_found_first_cu2)
+            {
+            pp2 = memchr(start_match, first_cu2, cu2size);
+            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            }
+
           if (pp1 == NULL)
             start_match = (pp2 == NULL)? end_subject : pp2;
           else