Fix pessimizing optimization of start-of-match code units in the interpreters.

2019-09-06 16:08:45 +00:00 · 2019-09-06 16:08:45 +00:00 · 7bbdc58513
parent 963b570fd0
commit 7bbdc58513
3 changed files with 79 additions and 18 deletions
--- a/11
+++ b/11
@ -92,7 +92,7 @@ within it, the nested lookbehind was not correctly processed. For example, if

 19. Implemented pcre2_get_match_data_size().

-20. Two alterations to partial matching (not yet done by JIT):
+20. Two alterations to partial matching:

    (a) The definition of a partial match is slightly changed: if a pattern
    contains any lookbehinds, an empty partial match may be given, because this
@ -130,6 +130,15 @@ inspected in that lookahead were not included.

 28. Add the pcre2_maketables_free() function.

+29. The start-up optimization that looks for a unique initial matching
+code unit in the interpretive engines uses memchr() in 8-bit mode. When the
+search is caseless, it was doing so inefficiently, which ended up slowing down
+the match drastically when the subject was very long. The revised code (a)
+remembers if one case is not found, so it never repeats the search for that
+case after a bumpalong and (b) when one case has been found, it searches only
+up to that position for an earlier occurrence of the other case. This fix
+applies to both the interpretive pcre2_match() and pcre2_dfa_match().
+

 Version 10.33 16-April-2019
 ---------------------------
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -3254,6 +3254,11 @@ BOOL utf, anchored, startline, firstline;
 BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;

+#if PCRE2_CODE_UNIT_WIDTH == 8
+BOOL memchr_not_found_first_cu = FALSE;
+BOOL memchr_not_found_first_cu2 = FALSE;
+#endif
+
 PCRE2_UCHAR first_cu = 0;
 PCRE2_UCHAR first_cu2 = 0;
 PCRE2_UCHAR req_cu = 0;
@ -3634,7 +3639,10 @@ for (;;)
    /* Not anchored. Advance to a unique first code unit if there is one. In
    8-bit mode, the use of memchr() gives a big speed up, even though we have
    to call it twice in caseless mode, in order to find the earliest occurrence
-    of the character in either of its cases. */
+    of the character in either of its cases. If a call to memchr() that
+    searches the rest of the subject fails to find one case, remember that in
+    order not to keep on repeating the search. This can make a huge difference
+    when the strings are very long and only one case is present. */

    else
      {
@ -3648,11 +3656,29 @@ for (;;)
                (smc = UCHAR21TEST(start_match)) != first_cu &&
                  smc != first_cu2)
            start_match++;
+
 #else  /* 8-bit code units */
-          PCRE2_SPTR pp1 =
-            memchr(start_match, first_cu, end_subject-start_match);
-          PCRE2_SPTR pp2 =
-            memchr(start_match, first_cu2, end_subject-start_match);
+          PCRE2_SPTR pp1 = NULL;
+          PCRE2_SPTR pp2 = NULL;
+          PCRE2_SIZE cu2size = end_subject - start_match;
+
+          if (!memchr_not_found_first_cu)
+            {
+            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+              else cu2size = pp1 - start_match;
+            }
+
+          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          to see if the other case is earlier, so we can set "not found" only
+          when both searches have returned NULL. */
+
+          if (!memchr_not_found_first_cu2)
+            {
+            pp2 = memchr(start_match, first_cu2, cu2size);
+            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            }
+ 
          if (pp1 == NULL)
            start_match = (pp2 == NULL)? end_subject : pp2;
          else
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -6047,6 +6047,11 @@ BOOL has_req_cu = FALSE;
 BOOL startline;
 BOOL utf;

+#if PCRE2_CODE_UNIT_WIDTH == 8
+BOOL memchr_not_found_first_cu = FALSE;
+BOOL memchr_not_found_first_cu2 = FALSE;
+#endif
+
 PCRE2_UCHAR first_cu = 0;
 PCRE2_UCHAR first_cu2 = 0;
 PCRE2_UCHAR req_cu = 0;
@ -6686,7 +6691,10 @@ for(;;)
    /* Not anchored. Advance to a unique first code unit if there is one. In
    8-bit mode, the use of memchr() gives a big speed up, even though we have
    to call it twice in caseless mode, in order to find the earliest occurrence
-    of the character in either of its cases. */
+    of the character in either of its cases. If a call to memchr() that
+    searches the rest of the subject fails to find one case, remember that in
+    order not to keep on repeating the search. This can make a huge difference
+    when the strings are very long and only one case is present. */

    else
      {
@ -6700,11 +6708,29 @@ for(;;)
                (smc = UCHAR21TEST(start_match)) != first_cu &&
                  smc != first_cu2)
            start_match++;
+
 #else  /* 8-bit code units */
-          PCRE2_SPTR pp1 =
-            memchr(start_match, first_cu, end_subject-start_match);
-          PCRE2_SPTR pp2 =
-            memchr(start_match, first_cu2, end_subject-start_match);
+          PCRE2_SPTR pp1 = NULL;
+          PCRE2_SPTR pp2 = NULL;
+          PCRE2_SIZE cu2size = end_subject - start_match;
+
+          if (!memchr_not_found_first_cu)
+            {
+            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+              else cu2size = pp1 - start_match;
+            }
+
+          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          to see if the other case is earlier, so we can set "not found" only
+          when both searches have returned NULL. */
+
+          if (!memchr_not_found_first_cu2)
+            {
+            pp2 = memchr(start_match, first_cu2, cu2size);
+            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            }
+
          if (pp1 == NULL)
            start_match = (pp2 == NULL)? end_subject : pp2;
          else