Allow anchored patterns to use "first code unit" optimization.

2017-06-30 16:00:33 +00:00 · 2017-06-30 16:00:33 +00:00 · b7d5cee61f
parent cc089cf971
commit b7d5cee61f
15 changed files with 673 additions and 273 deletions
--- a/5
+++ b/5
@ -205,6 +205,11 @@ JIT.
 subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are 
 much faster.

+46. Arrange for anchored patterns to record and use "first code unit" data,
+because this can give a fast "no match" without searching for a "required code 
+unit". Previously only non-anchored patterns did this.
+
+

 Version 10.23 14-February-2017
 ------------------------------
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -9632,14 +9632,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
     is_anchored(codestart, 0, &cb, 0, FALSE))
  re->overall_options |= PCRE2_ANCHORED;

-/* If the pattern is still not anchored and we do not have a first code unit,
-see if there is one that is asserted (these are not saved during the compile
-because they can cause conflicts with actual literals that follow). This code
-need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
-create will not be used. */
+/* Set up the first code unit or startline flag, the required code unit, and
+then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
+is set, as the data it would create will not be used. Note that a first code
+unit (but not the startline flag) is useful for anchored patterns because it
+can still give a quick "no match" and also avoid searching for a last code
+unit. */

-if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
+if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
  {
+  /* If we do not have a first code unit, see if there is one that is asserted
+  (these are not saved during the compile because they can cause conflicts with
+  actual literals that follow). */
+
  if (firstcuflags < 0)
    firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);

@ -9672,52 +9677,50 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
      }
    }

-  /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
-  flag. This is helpful for multiline matches when all branches start with ^
-  and also when all branches start with non-atomic .* for non-DOTALL matches
-  when *PRUNE and SKIP are not present. (There is an option that disables this
-  case.) */
+  /* When there is no first code unit, for non-anchored patterns, see if we can
+  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
+  branches start with ^ and also when all branches start with non-atomic .* for
+  non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
+  that disables this case.) */

-  else if (is_startline(codestart, 0, &cb, 0, FALSE))
+  else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
+           is_startline(codestart, 0, &cb, 0, FALSE))
    re->flags |= PCRE2_STARTLINE;
-  }

-/* Handle the "required code unit", if one is set. In the case of an anchored
-pattern, do this only if it follows a variable length item in the pattern.
-Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
+  /* Handle the "required code unit", if one is set. In the case of an anchored
+  pattern, do this only if it follows a variable length item in the pattern. */

-if (reqcuflags >= 0 &&
-     ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
-      (reqcuflags & REQ_VARY) != 0))
-  {
-  re->last_codeunit = reqcu;
-  re->flags |= PCRE2_LASTSET;
-
-  /* Handle caseless required code units as for first code units (above). */
-
-  if ((reqcuflags & REQ_CASELESS) != 0)
+  if (reqcuflags >= 0 &&
+       ((re->overall_options & PCRE2_ANCHORED) == 0 ||
+        (reqcuflags & REQ_VARY) != 0))
    {
-    if (reqcu < 128 || (!utf && reqcu < 255))
+    re->last_codeunit = reqcu;
+    re->flags |= PCRE2_LASTSET;
+
+    /* Handle caseless required code units as for first code units (above). */
+
+    if ((reqcuflags & REQ_CASELESS) != 0)
      {
-      if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
-      }
+      if (reqcu < 128 || (!utf && reqcu < 255))
+        {
+        if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
+        }
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
-      re->flags |= PCRE2_LASTCASELESS;
+      else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
 #endif
+      }
    }
-  }

-/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
-to set up information such as a bitmap of starting code units and a minimum
-matching length. */
+  /* Finally, study the compiled pattern to set up information such as a bitmap
+  of starting code units and a minimum matching length. */

-if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
-    PRIV(study)(re) != 0)
-  {
-  errorcode = ERR31;
-  goto HAD_CB_ERROR;
-  }
+  if (PRIV(study)(re) != 0)
+    {
+    errorcode = ERR31;
+    goto HAD_CB_ERROR;
+    }
+  }   /* End of start-of-match optimizations. */

 /* Control ends up here in all cases. When running under valgrind, make a
 pattern's terminating zero defined again. If memory was obtained for the parsed
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -3341,34 +3341,27 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
  }
 #endif  /* SUPPORT_UNICODE */

-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
  {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127)
-        first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127)
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
 #endif
-      }
    }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
  }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;

-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may be a "last known required code unit" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
  {
@ -3414,8 +3407,8 @@ for (;;)
    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    we stop the optimization scans for a first code unit at a newline. If the
+    match fails at the newline, later code breaks this loop. */

    if (firstline)
      {
@ -3435,69 +3428,137 @@ for (;;)
      end_subject = t;
      }
      
-    /* Advance to a unique first code unit if there is one. */
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */

-    if (has_first_cu)
+    if (anchored)
      {
-      PCRE2_UCHAR smc;
-      if (first_cu != first_cu2)
-        while (start_match < end_subject &&
-          (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-      else
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
-      }
-
-    /* Or to just after a linebreak for a multiline match */
-
-    else if (startline)
-      {
-      if (start_match > mb->start_subject + start_offset)
+      if (has_first_cu || start_bits != NULL)
        {
-#ifdef SUPPORT_UNICODE
-        if (utf)
+        BOOL ok = start_match < end_subject;
+        if (ok)
          {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
            {
-            start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            if (c > 255) c = 255;
+#endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
            }
          }
-        else
-#endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
-
-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
-
-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+        if (!ok) break;
        }
      }

-    /* Or to a non-unique first code unit if any have been identified. The
-    bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
-    code units greater than 254 set the 255 bit. */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */

-    else if (start_bits != NULL)
+    else
      {
-      while (start_match < end_subject)
+      if (has_first_cu)
        {
-        uint32_t c = UCHAR21TEST(start_match);
+        if (first_cu != first_cu2)  /* Caseless */
+          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
+            start_match++;
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          }
+
+        /* The caseful case */
+
+        else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
+#else
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
+#endif
+          }
+
+        /* If we can't find the first code unit, break the bumpalong loop,
+        to force a match failure, except when doing partial matching, when we
+        let the next cycle run at the end of the subject. To see why, consider
+        the pattern /(?<=abc)def/, which partially matches "abc", even though
+        the string does not contain the starting character "d". */
+
+        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
+            start_match >= end_subject)
+          break;
        }
-      }
+
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */
+
+      else if (startline)
+        {
+        if (start_match > mb->start_subject + start_offset)
+          {
+#ifdef SUPPORT_UNICODE
+          if (utf)
+            {
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, *start_match,
+                start_match++);
+              }
+            }
+          else
+#endif
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;
+
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */
+
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
+        }
+
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */
+
+      else if (start_bits != NULL)
+        {
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          if (c > 255) c = 255;
+#endif
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
+        }
+      }  /* End of first code unit handling */

    /* Restore fudged end_subject */

--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -6333,33 +6333,26 @@ mb->lcc = re->tables + lcc_offset;
 mb->fcc = re->tables + fcc_offset;
 mb->ctypes = re->tables + ctypes_offset;

-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
  {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
-      }
    }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
  }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;

-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may also be a "last known required code unit" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
  {
@ -6398,8 +6391,8 @@ for(;;)
    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    we stop the optimization scans for a first code unit at a newline. If the
+    match fails at the newline, later code breaks this loop. */

    if (firstline)
      {
@ -6419,107 +6412,143 @@ for(;;)
      end_subject = t;
      }

-    /* Advance to a unique first code unit if there is one. In 8-bit mode, the
-    use of memchr() gives a big speed up, even though we have to call it twice
-    in caseless mode, in order to find the first occurrence of the character in
-    either of its cases. */
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */

-    if (has_first_cu)
+    if (anchored)
      {
-      if (first_cu != first_cu2)  /* Caseless */
+      if (has_first_cu || start_bits != NULL)
        {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-        PCRE2_UCHAR smc;
-        while (start_match < end_subject &&
-              (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-#else  /* 8-bit code units */
-        PCRE2_SPTR pp1 = memchr(start_match, first_cu, end_subject-start_match);
-        PCRE2_SPTR pp2 = memchr(start_match, first_cu2, end_subject-start_match);
-        if (pp1 == NULL)
-          start_match = (pp2 == NULL)? end_subject : pp2;
-        else
-          start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
-#endif
-        }
-
-      /* The caseful case */
-
-      else
-        {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
-#else
-        start_match = memchr(start_match, first_cu, end_subject - start_match);
-        if (start_match == NULL) start_match = end_subject;
-#endif
-        }
-
-      /* If we can't find the required code unit, break the bumpalong loop, to
-      force a match failure, except when doing partial matching, when we let
-      the next cycle run at the end of the subject. To see why, consider the
-      pattern /(?<=abc)def/, which partially matches "abc", even though the
-      string does not contain the starting character "d". */
-
-      if (!mb->partial && start_match >= end_subject)
-        {
-        rc = MATCH_NOMATCH;
-        break;
-        }
-      }
-
-    /* If there's no first code unit, advance to just after a linebreak for a
-    multiline match if required. */
-
-    else if (startline)
-      {
-      if (start_match > mb->start_subject + start_offset)
-        {
-#ifdef SUPPORT_UNICODE
-        if (utf)
+        BOOL ok = start_match < end_subject;
+        if (ok)
          {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
            {
-            start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            if (c > 255) c = 255;
+#endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
            }
          }
-        else
-#endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
-
-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
-
-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+        if (!ok)
+          {
+          rc = MATCH_NOMATCH;
+          break;
+          }
        }
      }

-    /* If there's no first code unit or a requirement for a multiline line
-    start, advance to a non-unique first code unit if any have been identified.
-    The bitmap contains only 256 bits. When code units are 16 or 32 bits wide,
-    all code units greater than 254 set the 255 bit. */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */

-    else if (start_bits != NULL)
+    else
      {
-      while (start_match < end_subject)
+      if (has_first_cu)
        {
-        uint32_t c = UCHAR21TEST(start_match);
+        if (first_cu != first_cu2)  /* Caseless */
+          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
+            start_match++;
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          }
+
+        /* The caseful case */
+
+        else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
+#else
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
+#endif
+          }
+
+        /* If we can't find the first code unit, break the bumpalong loop,
+        to force a match failure, except when doing partial matching, when we
+        let the next cycle run at the end of the subject. To see why, consider
+        the pattern /(?<=abc)def/, which partially matches "abc", even though
+        the string does not contain the starting character "d". */
+
+        if (!mb->partial && start_match >= end_subject)
+          {
+          rc = MATCH_NOMATCH;
+          break;
+          }
        }
-      }
+
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */
+
+      else if (startline)
+        {
+        if (start_match > mb->start_subject + start_offset)
+          {
+#ifdef SUPPORT_UNICODE
+          if (utf)
+            {
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, *start_match,
+                start_match++);
+              }
+            }
+          else
+#endif
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;
+
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */
+
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
+        }
+
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */
+
+      else if (start_bits != NULL)
+        {
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          if (c > 255) c = 255;
+#endif
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
+        }
+      }   /* End first code unit handling */

    /* Restore fudged end_subject */

--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -953,7 +953,6 @@ do
      case OP_ALLANY:
      case OP_ANY:
      case OP_ANYBYTE:
-      case OP_CIRC:
      case OP_CIRCM:
      case OP_CLOSE:
      case OP_COMMIT:
@ -1021,6 +1020,13 @@ do
      case OP_THEN_ARG:
      return SSB_FAIL;

+      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
+      uses OP_CIRCM). Skip over it. */
+
+      case OP_CIRC:
+      tcode += PRIV(OP_lengths)[OP_CIRC];
+      break;
+
      /* A "real" property test implies no starting bits, but the fake property
      PT_CLIST identifies a list of characters. These lists are short, as they
      are used for characters with more than one "other case", so there is no
@ -1579,12 +1585,11 @@ BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
  re->name_entry_size * re->name_count;

-/* For an anchored pattern, or an unanchored pattern that has a first code
-unit, or a multiline pattern that matches only at "line start", there is no
-point in seeking a list of starting code units. */
+/* For a pattern that has a first code unit, or a multiline pattern that
+matches only at "line start", there is no point in seeking a list of starting
+code units. */

-if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
-    (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
+if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
  {
  int rc = set_start_bits(re, code, utf);
  if (rc == SSB_UNKNOWN) return 1;
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -467,4 +467,13 @@
 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    \x{dfff}\x{df01}\=no_utf_check
    
+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+    c
+    \x{ff}
+    \x{100}
+\= Expect no match
+    aaa
+
 # End of testinput10
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -373,4 +373,13 @@
 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    \x{dfff}\x{df01}\=no_utf_check

+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+    c
+    \x{ff}
+    \x{100}
+\= Expect no match
+    aaa
+
 # End of testinput12
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -5256,6 +5256,9 @@ a)"xI
    XAB     

 /^(?!A(?C1)B)C/
+    ABC\=callout_error=1,no_jit
+
+/^(?!A(?C1)B)C/no_start_optimize
    ABC\=callout_error=1

 /^(?(?!A(?C1)B)C)/
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -120,13 +120,6 @@
    \x{ff}
    \x{100}

-/^[^ab]/IB,utf
-    c
-    \x{ff}
-    \x{100}
-\= Expect no match
-    aaa
-
 /\x{100}*(\d+|"(?1)")/utf
    1234
    "1234"
@ -190,7 +183,10 @@
 /\w/utf
    \x{100}X

-/^\ሴ/IB,utf
+# Use no_start_optimize because the first code unit is different in 8-bit from
+# the wider modes.
+
+/^\ሴ/IB,utf,no_start_optimize

 /()()()()()()()()()()
 ()()()()()()()()()()
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1586,4 +1586,38 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP),
    \x{dfff}\x{df01}\=no_utf_check
 0: \x{dfff}\x{df01}
    
+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 
+  \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf 
+  \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
+  \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
+  \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput10
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -1433,4 +1433,42 @@ Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowe
 Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
    \x{dfff}\x{df01}\=no_utf_check

+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca 
+  \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 
+  \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 
+  \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 
+  \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput12
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -1425,4 +1425,42 @@ No match
    \x{dfff}\x{df01}\=no_utf_check
 0: \x{dfff}\x{df01}

+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca 
+  \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 
+  \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 
+  \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 
+  \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput12
--- a/testdata/testoutput17
+++ b/testdata/testoutput17
@ -368,6 +368,7 @@ No match
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 6
 JIT compilation was successful
 #pop jitverify
@ -379,6 +380,7 @@ JIT compilation was successful
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 6
 JIT compilation was successful
 #save testsaved1
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -72,6 +72,7 @@ No match
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
    abc
 0: abc
@ -110,6 +111,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
    abc
 0: abc
@ -339,6 +341,7 @@ Subject length lower bound = 19
 /the quick brown fox/I,anchored
 Capturing subpattern count = 0
 Options: anchored
+First code unit = 't'
 Subject length lower bound = 19
    the quick brown fox
 0: the quick brown fox
@ -351,6 +354,7 @@ Failed: error 111 at offset 4: unrecognized character after (? or (?-

 /^abc|def/I
 Capturing subpattern count = 0
+Starting code units: a d 
 Subject length lower bound = 3
    abcdef
 0: abc
@ -495,12 +499,14 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = '1'
 Subject length lower bound = 4

 /(^b|(?i)^d)/I
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: D b d 
 Subject length lower bound = 1

 /(?s).*/I
@ -624,6 +630,7 @@ Capturing subpattern count = 0
 Max lookbehind = 1
 Compile options: multiline
 Overall options: anchored multiline
+First code unit = 'a'
 Subject length lower bound = 3

 /^abc/Im
@ -637,6 +644,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 5
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
  aaaaabbbbbcccccdef
 0: aaaaabbbbbcccccdef
@ -808,6 +816,7 @@ Capturing subpattern count = 1
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: a 
 Subject length lower bound = 4
 \= Expect no match
    aaaa
@ -1004,6 +1013,7 @@ Subject length lower bound = 16
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Subject length lower bound = 4
    adef\=get=1,get=2,get=3,get=4,getall
 0: adef
@ -1042,6 +1052,7 @@ Get substring 4 failed (-49): unknown substring
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 7
    abc\00def\=copy=0,getall
 0: abc\x00def
@ -1227,6 +1238,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'i'
 Subject length lower bound = 3
    ississippi
 0: iss
@ -1286,6 +1298,7 @@ Capturing subpattern count = 0
 Contains explicit CR or LF match
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
    ab\nab\ncd
 0: ab\x0a
@ -1776,6 +1789,8 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P 
+  Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:^alnum:]]/IB
@ -1789,6 +1804,18 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > 
+  ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 
+  \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 
+  \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 
+  \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 
+  \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 
+  \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 
+  \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 
+  \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 
+  \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[[:alpha:]]/IB
@ -1802,6 +1829,8 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 
+  a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:^alpha:]]/IB
@ -1815,6 +1844,19 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 
+  \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 
+  \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 
+  \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 
+  \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 
+  \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf 
+  \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde 
+  \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed 
+  \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc 
+  \xfd \xfe \xff 
 Subject length lower bound = 1

 /[_[:alpha:]]/I
@ -1834,6 +1876,12 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 
+  \x7f 
 Subject length lower bound = 1

 /^[[:^ascii:]]/IB
@ -1847,6 +1895,15 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a 
+  \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 
+  \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 
+  \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 
+  \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 
+  \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 
+  \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 
+  \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 
+  \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[[:blank:]]/IB
@ -1860,6 +1917,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x09 \x20 
 Subject length lower bound = 1

 /^[[:^blank:]]/IB
@ -1873,6 +1931,20 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b 
+  \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a 
+  \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 
+  : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ 
+  _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 
+  \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f 
+  \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e 
+  \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad 
+  \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc 
+  \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb 
+  \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda 
+  \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 
+  \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 
+  \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /[\n\x0b\x0c\x0d[:blank:]]/I
@ -1892,6 +1964,9 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x7f 
 Subject length lower bound = 1

 /^[[:digit:]]/IB
@ -1905,6 +1980,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Subject length lower bound = 1

 /^[[:graph:]]/IB
@ -1918,6 +1994,9 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : 
+  ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ 
+  ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 
 Subject length lower bound = 1

 /^[[:lower:]]/IB
@ -1931,6 +2010,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:print:]]/IB
@ -1944,6 +2024,9 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 
+  9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] 
+  ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 
 Subject length lower bound = 1

 /^[[:punct:]]/IB
@ -1957,6 +2040,8 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ 
+  _ ` { | } ~ 
 Subject length lower bound = 1

 /^[[:space:]]/IB
@ -1970,6 +2055,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 
 Subject length lower bound = 1

 /^[[:upper:]]/IB
@ -1983,6 +2069,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 
 Subject length lower bound = 1

 /^[[:xdigit:]]/IB
@ -1996,6 +2083,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f 
 Subject length lower bound = 1

 /^[[:word:]]/IB
@ -2009,6 +2097,8 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P 
+  Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:^cntrl:]]/IB
@ -2022,6 +2112,18 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 
+  9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] 
+  ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x80 \x81 
+  \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 
+  \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f 
+  \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae 
+  \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd 
+  \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc 
+  \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb 
+  \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea 
+  \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 
+  \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[12[:^digit:]]/IB
@ -2035,6 +2137,20 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 1 2 : ; < 
+  = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a 
+  b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 
+  \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 
+  \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 
+  \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf 
+  \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe 
+  \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd 
+  \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc 
+  \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb 
+  \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa 
+  \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[[:^blank:]]/IB
@ -2048,6 +2164,20 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b 
+  \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a 
+  \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 
+  : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ 
+  _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 
+  \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f 
+  \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e 
+  \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad 
+  \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc 
+  \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb 
+  \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda 
+  \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 
+  \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 
+  \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /[01[:alpha:]%]/IB
@ -2418,6 +2548,7 @@ Subject length lower bound = 4
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 1
    aba
 0: aba
@ -2428,6 +2559,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2438,6 +2570,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2448,6 +2581,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2458,6 +2592,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2467,6 +2602,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2478,6 +2614,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2488,6 +2625,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2497,6 +2635,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2506,6 +2645,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2515,6 +2655,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2524,6 +2665,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2533,6 +2675,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2544,6 +2687,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbbaa
 0: aabbbbaa
@ -3052,6 +3196,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 5
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3

 /^x(?U)a+b/IB
@ -3067,6 +3212,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Last code unit = 'b'
 Subject length lower bound = 3

@ -3085,6 +3231,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Last code unit = 'b'
 Subject length lower bound = 3

@ -3725,6 +3872,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3

 /(?C)a|b/I
@ -3785,6 +3933,7 @@ No match
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = '>'
 Last code unit = '<'
 Subject length lower bound = 10
   >abc>123<xyz<
@ -3835,6 +3984,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+Starting code units: ( - 0 1 2 3 4 5 6 7 8 9 
 Subject length lower bound = 1
    12
 0: 12
@ -3854,6 +4004,7 @@ No match
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Subject length lower bound = 3
    xyz
 0: xyz
@ -3913,6 +4064,7 @@ Failed: error 114 at offset 10: missing closing parenthesis
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 9
    abcdefabc
 0: abcdefabc
@ -3922,6 +4074,7 @@ Subject length lower bound = 9
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c 
 Subject length lower bound = 2
    a=a
 0: a=a
@ -3937,6 +4090,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c 
 Subject length lower bound = 2
    a=a
 0: a=a
@ -5173,6 +5327,7 @@ No match
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Last code unit = '/'
 Subject length lower bound = 6
    13/05/04\=ps
@ -5270,6 +5425,7 @@ Partial match: c12
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Last code unit = 'X'
 Subject length lower bound = 4
    1\=ps
@ -5643,6 +5799,7 @@ Named capturing subpatterns:
  A   3
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    a1b\=copy=A
 0: a1
@ -5680,6 +5837,7 @@ Named capturing subpatterns:
  A   2
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    ab\=copy=A
 0: ab
@ -5693,6 +5851,7 @@ Named capturing subpatterns:
  A   1
  A   2
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    ab\=copy=A
 0: ab
@ -5711,6 +5870,7 @@ Named capturing subpatterns:
  A   3
  A   4
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    cdefgh\=copy=A
 0: cdefgh
@ -5727,6 +5887,7 @@ Named capturing subpatterns:
  A   3
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    a1b\=get=A
 0: a1
@ -5754,6 +5915,7 @@ Named capturing subpatterns:
  A   2
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    ab\=get=A
 0: ab
@ -5767,6 +5929,7 @@ Named capturing subpatterns:
  A   1
  A   2
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    ab\=get=A
 0: ab
@ -5785,6 +5948,7 @@ Named capturing subpatterns:
  A   3
  A   4
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    cdefgh\=get=A
 0: cdefgh
@ -5802,6 +5966,7 @@ Named capturing subpatterns:
 Compile options: <none>
 Overall options: anchored
 Duplicate name status changes
+First code unit = 'a'
 Subject length lower bound = 2
    a1b\=copy=A
 0: a1
@ -5832,6 +5997,7 @@ Named capturing subpatterns:
 Compile options: <none>
 Overall options: anchored
 Duplicate name status changes
+First code unit = 'a'
 Subject length lower bound = 6
    a bc d\=copy=A,copy=B,copy=C
 0: a bc d
@ -6233,6 +6399,7 @@ Subject length lower bound = 4
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2

@ -6249,6 +6416,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2

@ -6265,6 +6433,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2

@ -6281,6 +6450,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Last code unit = 'A'
 Subject length lower bound = 3
    aaaA5
@ -6302,6 +6472,7 @@ No match
 Capturing subpattern count = 0
 Compile options: caseless
 Overall options: anchored caseless
+Starting code units: A a 
 Last code unit = 'A' (caseless)
 Subject length lower bound = 2
    aaaA5
@ -9540,6 +9711,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'F'
 Last code unit = ':'
 Subject length lower bound = 22

@ -9691,6 +9863,7 @@ Named capturing subpatterns:
  D   1
 Compile options: dupnames extended
 Overall options: anchored dupnames extended
+Starting code units: a e 
 Subject length lower bound = 2
    abcdX
 0: abcdX
@ -10445,12 +10618,14 @@ Failed: error 125 at offset 0: lookbehind assertion is not fixed length
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(^ab)++/I
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(^ab|^)+/I
@ -10471,12 +10646,14 @@ Subject length lower bound = 0
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(?:^ab)++/I
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(?:^ab|^)+/I
@ -11586,6 +11763,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: dotall
 Overall options: anchored dotall
+First code unit = 'a'
 Subject length lower bound = 2

 /.*?a(*SKIP)b/I
@ -11608,6 +11786,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: dotall
 Overall options: anchored dotall
+First code unit = 'a'
 Subject length lower bound = 2

 /(?>.*?)(?<=(abcd)|(wxyz))/I
@ -13375,7 +13554,6 @@ Subject length lower bound = 1
 /(|ab)*?d/I,no_start_optimize
 Capturing subpattern count = 1
 Options: no_start_optimize
-Last code unit = 'd'
 Subject length lower bound = 0
   abd
 0: abd
@ -13641,12 +13819,14 @@ get substring list failed (-2): partial match
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3

 /^abc/info,no_dotstar_anchor
 Capturing subpattern count = 0
 Compile options: no_dotstar_anchor
 Overall options: anchored no_dotstar_anchor
+First code unit = 'a'
 Subject length lower bound = 3

 /.*\d/info,auto_callout
@ -14684,6 +14864,7 @@ Capturing subpattern count = 2
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'o'
 Last code unit = '}'
 Subject length lower bound = 65535

@ -15607,6 +15788,7 @@ No match
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'b'
 Subject length lower bound = 2

 /(a){0}.*bc/sI
@ -15885,6 +16067,10 @@ No match
 No match

 /^(?!A(?C1)B)C/
+    ABC\=callout_error=1,no_jit
+No match
+
+/^(?!A(?C1)B)C/no_start_optimize
    ABC\=callout_error=1
 --->ABC
  1 ^^      B
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -194,6 +194,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: utf
 Overall options: anchored utf
+Starting code units: a b 
 Subject length lower bound = 1
    bar
 0: b
@ -205,28 +206,6 @@ No match
    \x{100}
 No match

-/^[^ab]/IB,utf
------------------------------------------------------------------
-        Bra
-        ^
-        [\x00-`c-\xff] (neg)
-        Ket
-        End
------------------------------------------------------------------
-Capturing subpattern count = 0
-Compile options: utf
-Overall options: anchored utf
-Subject length lower bound = 1
-    c
- 0: c
-    \x{ff}
- 0: \x{ff}
-    \x{100}
- 0: \x{100}
-\= Expect no match
-    aaa
-No match
-
 /\x{100}*(\d+|"(?1)")/utf
    1234
 0: 1234
@ -479,7 +458,10 @@ Subject length lower bound = 0
    \x{100}X
 0: X

-/^\ሴ/IB,utf
+# Use no_start_optimize because the first code unit is different in 8-bit from
+# the wider modes.
+
+/^\ሴ/IB,utf,no_start_optimize
 ------------------------------------------------------------------
        Bra
        ^
@ -488,9 +470,9 @@ Subject length lower bound = 0
        End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Compile options: utf
-Overall options: anchored utf
-Subject length lower bound = 1
+Compile options: no_start_optimize utf
+Overall options: anchored no_start_optimize utf
+Subject length lower bound = 0

 /()()()()()()()()()()
 ()()()()()()()()()()