Improve code for "starts with" optimization in the interpreters.

2021-08-29 17:25:59 +01:00 · 2021-08-29 17:25:59 +01:00 · eea410b33a
parent d5a61ee891
commit eea410b33a
3 changed files with 95 additions and 52 deletions
--- a/5
+++ b/5
@@ -41,6 +41,11 @@ quote old; it was released in 2014.
 detecting symlink loops. This is dependent on the availability of realpath(), 
 which is now tested for in ./configure and CMakeLists.txt. 
 5. Implemented a modified version of Thomas Tempelmann's patch for handling 
 case-independent "first code unit" searches for unanchored patterns in 8-bit
 mode in the interpreters. Instead of just remembering whether one case matched
 or not, it remembers the position of a previous match so as to avoid
 unnecessary repeated searching.
 Version 10.37 26-May-2021
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2020 University of Cambridge
+          New API code Copyright (c) 2016-2021 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -3256,8 +3256,8 @@ BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;
 #if PCRE2_CODE_UNIT_WIDTH == 8
-BOOL memchr_not_found_first_cu = FALSE;
+PCRE2_SPTR memchr_found_first_cu = NULL;
-BOOL memchr_not_found_first_cu2 = FALSE;
+PCRE2_SPTR memchr_found_first_cu2 = NULL;
 #endif
 PCRE2_UCHAR first_cu = 0;
@@ -3648,13 +3648,7 @@ for (;;)
        }
      }
-    /* Not anchored. Advance to a unique first code unit if there is one. In
+    /* Not anchored. Advance to a unique first code unit if there is one. */
    8-bit mode, the use of memchr() gives a big speed up, even though we have
    to call it twice in caseless mode, in order to find the earliest occurrence
    of the character in either of its cases. If a call to memchr() that
    searches the rest of the subject fails to find one case, remember that in
    order not to keep on repeating the search. This can make a huge difference
    when the strings are very long and only one case is present. */
    else
      {
@@ -3662,43 +3656,68 @@ for (;;)
        {
        if (first_cu != first_cu2)  /* Caseless */
          {
          /* In 16-bit and 32_bit modes we have to do our own search, so can
          look for both cases at once. */
 #if PCRE2_CODE_UNIT_WIDTH != 8
          PCRE2_UCHAR smc;
          while (start_match < end_subject &&
                (smc = UCHAR21TEST(start_match)) != first_cu &&
-                  smc != first_cu2)
+                 smc != first_cu2)
            start_match++;
 #else
          /* In 8-bit mode, the use of memchr() gives a big speed up, even
          though we have to call it twice in order to find the earliest
          occurrence of the code unit in either of its cases. Caching is used
          to remember the positions of previously found code units. This can
          make a huge difference when the strings are very long and only one
          case is actually present. */
 #else  /* 8-bit code units */
          PCRE2_SPTR pp1 = NULL;
          PCRE2_SPTR pp2 = NULL;
-          PCRE2_SIZE cu2size = end_subject - start_match;
+          PCRE2_SIZE searchlength = end_subject - start_match;
-          if (!memchr_not_found_first_cu)
+          /* If we haven't got a previously found position for first_cu, or if
          the current starting position is later, we need to do a search. If
          the code unit is not found, set it to the end. */
          if (memchr_found_first_cu == NULL ||
              start_match > memchr_found_first_cu)
            {
-            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            pp1 = memchr(start_match, first_cu, searchlength);
-            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+            memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
              else cu2size = pp1 - start_match;
            }
-          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          /* If the start is before a previously found position, use the
-          to see if the other case is earlier, so we can set "not found" only
+          previous position, or NULL if a previous search failed. */
          when both searches have returned NULL. */
-          if (!memchr_not_found_first_cu2)
+          else pp1 = (memchr_found_first_cu == end_subject)? NULL :
            memchr_found_first_cu;
          /* Do the same thing for the other case. */
          if (memchr_found_first_cu2 == NULL ||
              start_match > memchr_found_first_cu2)
            {
-            pp2 = memchr(start_match, first_cu2, cu2size);
+            pp2 = memchr(start_match, first_cu2, searchlength);
-            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
            }
          else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
            memchr_found_first_cu2;
          /* Set the start to the end of the subject if neither case was found.
          Otherwise, use the earlier found point. */
          if (pp1 == NULL)
            start_match = (pp2 == NULL)? end_subject : pp2;
          else
            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
-#endif
+             
 #endif  /* 8-bit handling */
          }
-        /* The caseful case */
+        /* The caseful case is much simpler. */
        else
          {
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2015-2020 University of Cambridge
+          New API code Copyright (c) 2015-2021 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -6117,8 +6117,8 @@ BOOL has_req_cu = FALSE;
 BOOL startline;
 #if PCRE2_CODE_UNIT_WIDTH == 8
-BOOL memchr_not_found_first_cu;
+PCRE2_SPTR memchr_found_first_cu;
-BOOL memchr_not_found_first_cu2;
+PCRE2_SPTR memchr_found_first_cu2;
 #endif
 PCRE2_UCHAR first_cu = 0;
@@ -6712,8 +6712,8 @@ start_partial = match_partial = NULL;
 mb->hitend = FALSE;
 #if PCRE2_CODE_UNIT_WIDTH == 8
-memchr_not_found_first_cu = FALSE;
+memchr_found_first_cu = NULL;
-memchr_not_found_first_cu2 = FALSE;
+memchr_found_first_cu2 = NULL;
 #endif
 for(;;)
@@ -6782,13 +6782,7 @@ for(;;)
        }
      }
-    /* Not anchored. Advance to a unique first code unit if there is one. In
+    /* Not anchored. Advance to a unique first code unit if there is one. */
    8-bit mode, the use of memchr() gives a big speed up, even though we have
    to call it twice in caseless mode, in order to find the earliest occurrence
    of the character in either of its cases. If a call to memchr() that
    searches the rest of the subject fails to find one case, remember that in
    order not to keep on repeating the search. This can make a huge difference
    when the strings are very long and only one case is present. */
    else
      {
@@ -6796,43 +6790,68 @@ for(;;)
        {
        if (first_cu != first_cu2)  /* Caseless */
          {
          /* In 16-bit and 32_bit modes we have to do our own search, so can
          look for both cases at once. */
 #if PCRE2_CODE_UNIT_WIDTH != 8
          PCRE2_UCHAR smc;
          while (start_match < end_subject &&
                (smc = UCHAR21TEST(start_match)) != first_cu &&
-                  smc != first_cu2)
+                 smc != first_cu2)
            start_match++;
 #else
          /* In 8-bit mode, the use of memchr() gives a big speed up, even
          though we have to call it twice in order to find the earliest
          occurrence of the code unit in either of its cases. Caching is used
          to remember the positions of previously found code units. This can
          make a huge difference when the strings are very long and only one
          case is actually present. */
 #else  /* 8-bit code units */
          PCRE2_SPTR pp1 = NULL;
          PCRE2_SPTR pp2 = NULL;
-          PCRE2_SIZE cu2size = end_subject - start_match;
+          PCRE2_SIZE searchlength = end_subject - start_match;
-          if (!memchr_not_found_first_cu)
+          /* If we haven't got a previously found position for first_cu, or if
          the current starting position is later, we need to do a search. If
          the code unit is not found, set it to the end. */
          if (memchr_found_first_cu == NULL ||
              start_match > memchr_found_first_cu)
            {
-            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            pp1 = memchr(start_match, first_cu, searchlength);
-            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+            memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
              else cu2size = pp1 - start_match;
            }
-          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          /* If the start is before a previously found position, use the
-          to see if the other case is earlier, so we can set "not found" only
+          previous position, or NULL if a previous search failed. */
          when both searches have returned NULL. */
-          if (!memchr_not_found_first_cu2)
+          else pp1 = (memchr_found_first_cu == end_subject)? NULL :
            memchr_found_first_cu;
          /* Do the same thing for the other case. */
          if (memchr_found_first_cu2 == NULL ||
              start_match > memchr_found_first_cu2)
            {
-            pp2 = memchr(start_match, first_cu2, cu2size);
+            pp2 = memchr(start_match, first_cu2, searchlength);
-            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
            }
          else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
            memchr_found_first_cu2;
          /* Set the start to the end of the subject if neither case was found.
          Otherwise, use the earlier found point. */
          if (pp1 == NULL)
            start_match = (pp2 == NULL)? end_subject : pp2;
          else
            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
-#endif
+             
 #endif  /* 8-bit handling */
          }
-        /* The caseful case */
+        /* The caseful case is much simpler. */
        else
          {