Allow anchored patterns to use "first code unit" optimization.

2017-06-30 16:00:33 +00:00 · 2017-06-30 16:00:33 +00:00 · b7d5cee61f
parent cc089cf971
commit b7d5cee61f
15 changed files with 673 additions and 273 deletions
--- a/5
+++ b/5
@ -205,6 +205,11 @@ JIT.
 subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are 
 much faster.

+46. Arrange for anchored patterns to record and use "first code unit" data,
+because this can give a fast "no match" without searching for a "required code 
+unit". Previously only non-anchored patterns did this.
+
+

 Version 10.23 14-February-2017
 ------------------------------
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -9632,14 +9632,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
     is_anchored(codestart, 0, &cb, 0, FALSE))
  re->overall_options |= PCRE2_ANCHORED;

-/* If the pattern is still not anchored and we do not have a first code unit,
-see if there is one that is asserted (these are not saved during the compile
-because they can cause conflicts with actual literals that follow). This code
-need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
-create will not be used. */
+/* Set up the first code unit or startline flag, the required code unit, and
+then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
+is set, as the data it would create will not be used. Note that a first code
+unit (but not the startline flag) is useful for anchored patterns because it
+can still give a quick "no match" and also avoid searching for a last code
+unit. */

-if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
+if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
  {
+  /* If we do not have a first code unit, see if there is one that is asserted
+  (these are not saved during the compile because they can cause conflicts with
+  actual literals that follow). */
+
  if (firstcuflags < 0)
    firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);

@ -9672,52 +9677,50 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
      }
    }

-  /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
-  flag. This is helpful for multiline matches when all branches start with ^
-  and also when all branches start with non-atomic .* for non-DOTALL matches
-  when *PRUNE and SKIP are not present. (There is an option that disables this
-  case.) */
+  /* When there is no first code unit, for non-anchored patterns, see if we can
+  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
+  branches start with ^ and also when all branches start with non-atomic .* for
+  non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
+  that disables this case.) */

-  else if (is_startline(codestart, 0, &cb, 0, FALSE))
+  else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
+           is_startline(codestart, 0, &cb, 0, FALSE))
    re->flags |= PCRE2_STARTLINE;
-  }

-/* Handle the "required code unit", if one is set. In the case of an anchored
-pattern, do this only if it follows a variable length item in the pattern.
-Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
+  /* Handle the "required code unit", if one is set. In the case of an anchored
+  pattern, do this only if it follows a variable length item in the pattern. */

-if (reqcuflags >= 0 &&
-     ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
-      (reqcuflags & REQ_VARY) != 0))
-  {
-  re->last_codeunit = reqcu;
-  re->flags |= PCRE2_LASTSET;
-
-  /* Handle caseless required code units as for first code units (above). */
-
-  if ((reqcuflags & REQ_CASELESS) != 0)
+  if (reqcuflags >= 0 &&
+       ((re->overall_options & PCRE2_ANCHORED) == 0 ||
+        (reqcuflags & REQ_VARY) != 0))
    {
-    if (reqcu < 128 || (!utf && reqcu < 255))
+    re->last_codeunit = reqcu;
+    re->flags |= PCRE2_LASTSET;
+
+    /* Handle caseless required code units as for first code units (above). */
+
+    if ((reqcuflags & REQ_CASELESS) != 0)
      {
-      if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
-      }
+      if (reqcu < 128 || (!utf && reqcu < 255))
+        {
+        if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
+        }
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
-      re->flags |= PCRE2_LASTCASELESS;
+      else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
 #endif
+      }
    }
-  }

-/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
-to set up information such as a bitmap of starting code units and a minimum
-matching length. */
+  /* Finally, study the compiled pattern to set up information such as a bitmap
+  of starting code units and a minimum matching length. */

-if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
-    PRIV(study)(re) != 0)
-  {
-  errorcode = ERR31;
-  goto HAD_CB_ERROR;
-  }
+  if (PRIV(study)(re) != 0)
+    {
+    errorcode = ERR31;
+    goto HAD_CB_ERROR;
+    }
+  }   /* End of start-of-match optimizations. */

 /* Control ends up here in all cases. When running under valgrind, make a
 pattern's terminating zero defined again. If memory was obtained for the parsed
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -3341,34 +3341,27 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
  }
 #endif  /* SUPPORT_UNICODE */

-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
  {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127)
-        first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127)
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
 #endif
-      }
    }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
  }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;

-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may be a "last known required code unit" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
  {
@ -3414,8 +3407,8 @@ for (;;)
    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    we stop the optimization scans for a first code unit at a newline. If the
+    match fails at the newline, later code breaks this loop. */

    if (firstline)
      {
@ -3434,70 +3427,138 @@ for (;;)
      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
      end_subject = t;
      }
+      
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */

-    /* Advance to a unique first code unit if there is one. */
-
-    if (has_first_cu)
+    if (anchored)
      {
-      PCRE2_UCHAR smc;
-      if (first_cu != first_cu2)
-        while (start_match < end_subject &&
-          (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-      else
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
-      }
-
-    /* Or to just after a linebreak for a multiline match */
-
-    else if (startline)
-      {
-      if (start_match > mb->start_subject + start_offset)
+      if (has_first_cu || start_bits != NULL)
        {
-#ifdef SUPPORT_UNICODE
-        if (utf)
+        BOOL ok = start_match < end_subject;
+        if (ok)
          {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
            {
-            start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            if (c > 255) c = 255;
+#endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
            }
          }
-        else
-#endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
-
-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
-
-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+        if (!ok) break;
        }
      }

-    /* Or to a non-unique first code unit if any have been identified. The
-    bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
-    code units greater than 254 set the 255 bit. */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */

-    else if (start_bits != NULL)
+    else
      {
-      while (start_match < end_subject)
+      if (has_first_cu)
        {
-        uint32_t c = UCHAR21TEST(start_match);
+        if (first_cu != first_cu2)  /* Caseless */
+          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
+            start_match++;
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          }
+
+        /* The caseful case */
+
+        else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
+#else
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
+#endif
+          }
+
+        /* If we can't find the first code unit, break the bumpalong loop,
+        to force a match failure, except when doing partial matching, when we
+        let the next cycle run at the end of the subject. To see why, consider
+        the pattern /(?<=abc)def/, which partially matches "abc", even though
+        the string does not contain the starting character "d". */
+
+        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
+            start_match >= end_subject)
+          break;
        }
-      }
+
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */
+
+      else if (startline)
+        {
+        if (start_match > mb->start_subject + start_offset)
+          {
+#ifdef SUPPORT_UNICODE
+          if (utf)
+            {
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, *start_match,
+                start_match++);
+              }
+            }
+          else
+#endif
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;
+
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */
+
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
+        }
+
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */
+
+      else if (start_bits != NULL)
+        {
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          if (c > 255) c = 255;
+#endif
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
+        }
+      }  /* End of first code unit handling */

    /* Restore fudged end_subject */

--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -270,7 +270,7 @@ pcre2_callout_block cb;

 *lengthptr = (*Fecode == OP_CALLOUT)?
  PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
-  
+
 if (mb->callout == NULL) return 0;   /* No callout function provided */

 /* The original matching code (pre 10.30) worked directly with the ovector
@ -279,11 +279,11 @@ ovector is in the backtracking frame, it no longer needs to reserve space for
 the overall match offsets (which would waste space in the frame). For backward
 compatibility, however, we pass capture_top and offset_vector to the callout as
 if for the extended ovector, and we ensure that the first two slots are unset
-by preserving and restoring their current contents. Picky compilers complain if 
-references such as Fovector[-2] are use directly, so we set up a separate 
+by preserving and restoring their current contents. Picky compilers complain if
+references such as Fovector[-2] are used directly, so we set up a separate
 pointer. */

-callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; 
+callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;

 cb.version          = 1;
 cb.capture_top      = (uint32_t)Foffset_top/2 + 1;
@ -935,8 +935,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

    /* ===================================================================== */
    /* Match a single character, caselessly. If we are at the end of the
-    subject, give up immediately. We get here only when the pattern character 
-    has at most one other case. Characters with more than two cases are coded 
+    subject, give up immediately. We get here only when the pattern character
+    has at most one other case. Characters with more than two cases are coded
    as OP_PROP with the pseudo-property PT_CLIST. */

    case OP_CHARI:
@ -954,7 +954,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      GETCHARLEN(fc, Fecode, Flength);

      /* If the pattern character's value is < 128, we know that its other case
-      (if any) is also < 128 (and therefore only one code unit long in all 
+      (if any) is also < 128 (and therefore only one code unit long in all
      code-unit widths), so we can use the fast lookup table. We checked above
      that there is at least one character left in the subject. */

@ -966,7 +966,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
        Feptr++;
        }

-      /* Otherwise we must pick up the subject character and use Unicode 
+      /* Otherwise we must pick up the subject character and use Unicode
      property support to test its other case. Note that we cannot use the
      value of "Flength" to check for sufficient bytes left, because the other
      case of the character may have more or fewer code units. */
@ -3056,7 +3056,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
          }
        Feptr += Lmin;
        break;
-        
+
        /* This OP_ANYBYTE case will never be reached because \C gets turned
        into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
        reports don't complain about it's never being used. */
@ -5352,8 +5352,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
                (char *)assert_accept_frame + offsetof(heapframe, ovector),
                assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
          Foffset_top = assert_accept_frame->offset_top;
-           
-          /* Fall through */ 
+
+          /* Fall through */
          /* In the case of a match, the captures have already been put into
          the current frame. */

@ -5650,7 +5650,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
    if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;

-    /* Fall through */ 
+    /* Fall through */
    /* Unconditional end of subject assertion (\z) */

    case OP_EOD:
@ -6280,7 +6280,7 @@ The last of these is changed within the match() function if the frame vector
 has to be expanded. We therefore put it into the match block so that it is
 correct when calling match() more than once for non-anchored patterns. */

-frame_size = offsetof(heapframe, ovector) + 
+frame_size = offsetof(heapframe, ovector) +
  re->top_bracket * 2 * sizeof(PCRE2_SIZE);

 /* Limits set in the pattern override the match context only if they are
@ -6333,33 +6333,26 @@ mb->lcc = re->tables + lcc_offset;
 mb->fcc = re->tables + fcc_offset;
 mb->ctypes = re->tables + ctypes_offset;

-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
  {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
-      }
    }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
  }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;

-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may also be a "last known required code unit" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
  {
@ -6398,8 +6391,8 @@ for(;;)
    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    we stop the optimization scans for a first code unit at a newline. If the
+    match fails at the newline, later code breaks this loop. */

    if (firstline)
      {
@ -6419,107 +6412,143 @@ for(;;)
      end_subject = t;
      }

-    /* Advance to a unique first code unit if there is one. In 8-bit mode, the
-    use of memchr() gives a big speed up, even though we have to call it twice
-    in caseless mode, in order to find the first occurrence of the character in
-    either of its cases. */
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */

-    if (has_first_cu)
+    if (anchored)
      {
-      if (first_cu != first_cu2)  /* Caseless */
+      if (has_first_cu || start_bits != NULL)
        {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-        PCRE2_UCHAR smc;
-        while (start_match < end_subject &&
-              (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-#else  /* 8-bit code units */
-        PCRE2_SPTR pp1 = memchr(start_match, first_cu, end_subject-start_match);
-        PCRE2_SPTR pp2 = memchr(start_match, first_cu2, end_subject-start_match);
-        if (pp1 == NULL)
-          start_match = (pp2 == NULL)? end_subject : pp2;
-        else
-          start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
-#endif
-        }
-
-      /* The caseful case */
-
-      else
-        {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
-#else
-        start_match = memchr(start_match, first_cu, end_subject - start_match);
-        if (start_match == NULL) start_match = end_subject;
-#endif
-        }
-
-      /* If we can't find the required code unit, break the bumpalong loop, to
-      force a match failure, except when doing partial matching, when we let
-      the next cycle run at the end of the subject. To see why, consider the
-      pattern /(?<=abc)def/, which partially matches "abc", even though the
-      string does not contain the starting character "d". */
-
-      if (!mb->partial && start_match >= end_subject)
-        {
-        rc = MATCH_NOMATCH;
-        break;
-        }
-      }
-
-    /* If there's no first code unit, advance to just after a linebreak for a
-    multiline match if required. */
-
-    else if (startline)
-      {
-      if (start_match > mb->start_subject + start_offset)
-        {
-#ifdef SUPPORT_UNICODE
-        if (utf)
+        BOOL ok = start_match < end_subject;
+        if (ok)
          {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
            {
-            start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            if (c > 255) c = 255;
+#endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
            }
          }
-        else
-#endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
-
-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
-
-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+        if (!ok)
+          {
+          rc = MATCH_NOMATCH;
+          break;
+          }
        }
      }

-    /* If there's no first code unit or a requirement for a multiline line
-    start, advance to a non-unique first code unit if any have been identified.
-    The bitmap contains only 256 bits. When code units are 16 or 32 bits wide,
-    all code units greater than 254 set the 255 bit. */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */

-    else if (start_bits != NULL)
+    else
      {
-      while (start_match < end_subject)
+      if (has_first_cu)
        {
-        uint32_t c = UCHAR21TEST(start_match);
+        if (first_cu != first_cu2)  /* Caseless */
+          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
+            start_match++;
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          }
+
+        /* The caseful case */
+
+        else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
+#else
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
+#endif
+          }
+
+        /* If we can't find the first code unit, break the bumpalong loop,
+        to force a match failure, except when doing partial matching, when we
+        let the next cycle run at the end of the subject. To see why, consider
+        the pattern /(?<=abc)def/, which partially matches "abc", even though
+        the string does not contain the starting character "d". */
+
+        if (!mb->partial && start_match >= end_subject)
+          {
+          rc = MATCH_NOMATCH;
+          break;
+          }
        }
-      }
+
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */
+
+      else if (startline)
+        {
+        if (start_match > mb->start_subject + start_offset)
+          {
+#ifdef SUPPORT_UNICODE
+          if (utf)
+            {
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, *start_match,
+                start_match++);
+              }
+            }
+          else
+#endif
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;
+
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */
+
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
+        }
+
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */
+
+      else if (start_bits != NULL)
+        {
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          if (c > 255) c = 255;
+#endif
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
+        }
+      }   /* End first code unit handling */

    /* Restore fudged end_subject */

--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -799,7 +799,7 @@ if (caseless)
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
 #endif
    }
-  else 
+  else
 #endif  /* SUPPORT_UNICODE */

  /* Not UTF */
@ -953,7 +953,6 @@ do
      case OP_ALLANY:
      case OP_ANY:
      case OP_ANYBYTE:
-      case OP_CIRC:
      case OP_CIRCM:
      case OP_CLOSE:
      case OP_COMMIT:
@ -1021,6 +1020,13 @@ do
      case OP_THEN_ARG:
      return SSB_FAIL;

+      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
+      uses OP_CIRCM). Skip over it. */
+
+      case OP_CIRC:
+      tcode += PRIV(OP_lengths)[OP_CIRC];
+      break;
+
      /* A "real" property test implies no starting bits, but the fake property
      PT_CLIST identifies a list of characters. These lists are short, as they
      are used for characters with more than one "other case", so there is no
@ -1450,7 +1456,7 @@ do
 #endif
      /* It seems that the fall through comment must be outside the #ifdef if
      it is to avoid the gcc compiler warning. */
-        
+
      /* Fall through */

      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
@ -1579,12 +1585,11 @@ BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
  re->name_entry_size * re->name_count;

-/* For an anchored pattern, or an unanchored pattern that has a first code
-unit, or a multiline pattern that matches only at "line start", there is no
-point in seeking a list of starting code units. */
+/* For a pattern that has a first code unit, or a multiline pattern that
+matches only at "line start", there is no point in seeking a list of starting
+code units. */

-if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
-    (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
+if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
  {
  int rc = set_start_bits(re, code, utf);
  if (rc == SSB_UNKNOWN) return 1;
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -466,5 +466,14 @@

 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    \x{dfff}\x{df01}\=no_utf_check
+    
+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+    c
+    \x{ff}
+    \x{100}
+\= Expect no match
+    aaa

 # End of testinput10
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -373,4 +373,13 @@
 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    \x{dfff}\x{df01}\=no_utf_check

+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+    c
+    \x{ff}
+    \x{100}
+\= Expect no match
+    aaa
+
 # End of testinput12
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -5256,6 +5256,9 @@ a)"xI
    XAB     

 /^(?!A(?C1)B)C/
+    ABC\=callout_error=1,no_jit
+
+/^(?!A(?C1)B)C/no_start_optimize
    ABC\=callout_error=1

 /^(?(?!A(?C1)B)C)/
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -120,13 +120,6 @@
    \x{ff}
    \x{100}

-/^[^ab]/IB,utf
-    c
-    \x{ff}
-    \x{100}
-\= Expect no match
-    aaa
-
 /\x{100}*(\d+|"(?1)")/utf
    1234
    "1234"
@ -190,7 +183,10 @@
 /\w/utf
    \x{100}X

-/^\ሴ/IB,utf
+# Use no_start_optimize because the first code unit is different in 8-bit from
+# the wider modes.
+
+/^\ሴ/IB,utf,no_start_optimize

 /()()()()()()()()()()
 ()()()()()()()()()()
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1585,5 +1585,39 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP),
 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    \x{dfff}\x{df01}\=no_utf_check
 0: \x{dfff}\x{df01}
+    
+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 
+  \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf 
+  \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
+  \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
+  \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match

 # End of testinput10
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -1433,4 +1433,42 @@ Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowe
 Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
    \x{dfff}\x{df01}\=no_utf_check

+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca 
+  \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 
+  \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 
+  \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 
+  \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput12
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -1425,4 +1425,42 @@ No match
    \x{dfff}\x{df01}\=no_utf_check
 0: \x{dfff}\x{df01}

+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca 
+  \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 
+  \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 
+  \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 
+  \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput12
--- a/testdata/testoutput17
+++ b/testdata/testoutput17
@ -368,6 +368,7 @@ No match
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 6
 JIT compilation was successful
 #pop jitverify
@ -379,6 +380,7 @@ JIT compilation was successful
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 6
 JIT compilation was successful
 #save testsaved1
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -72,6 +72,7 @@ No match
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
    abc
 0: abc
@ -110,6 +111,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
    abc
 0: abc
@ -339,6 +341,7 @@ Subject length lower bound = 19
 /the quick brown fox/I,anchored
 Capturing subpattern count = 0
 Options: anchored
+First code unit = 't'
 Subject length lower bound = 19
    the quick brown fox
 0: the quick brown fox
@ -351,6 +354,7 @@ Failed: error 111 at offset 4: unrecognized character after (? or (?-

 /^abc|def/I
 Capturing subpattern count = 0
+Starting code units: a d 
 Subject length lower bound = 3
    abcdef
 0: abc
@ -495,12 +499,14 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = '1'
 Subject length lower bound = 4

 /(^b|(?i)^d)/I
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: D b d 
 Subject length lower bound = 1

 /(?s).*/I
@ -624,6 +630,7 @@ Capturing subpattern count = 0
 Max lookbehind = 1
 Compile options: multiline
 Overall options: anchored multiline
+First code unit = 'a'
 Subject length lower bound = 3

 /^abc/Im
@ -637,6 +644,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 5
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
  aaaaabbbbbcccccdef
 0: aaaaabbbbbcccccdef
@ -808,6 +816,7 @@ Capturing subpattern count = 1
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: a 
 Subject length lower bound = 4
 \= Expect no match
    aaaa
@ -1004,6 +1013,7 @@ Subject length lower bound = 16
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Subject length lower bound = 4
    adef\=get=1,get=2,get=3,get=4,getall
 0: adef
@ -1042,6 +1052,7 @@ Get substring 4 failed (-49): unknown substring
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 7
    abc\00def\=copy=0,getall
 0: abc\x00def
@ -1227,6 +1238,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'i'
 Subject length lower bound = 3
    ississippi
 0: iss
@ -1286,6 +1298,7 @@ Capturing subpattern count = 0
 Contains explicit CR or LF match
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
    ab\nab\ncd
 0: ab\x0a
@ -1776,6 +1789,8 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P 
+  Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:^alnum:]]/IB
@ -1789,6 +1804,18 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > 
+  ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 
+  \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 
+  \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 
+  \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 
+  \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 
+  \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 
+  \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 
+  \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 
+  \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[[:alpha:]]/IB
@ -1802,6 +1829,8 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 
+  a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:^alpha:]]/IB
@ -1815,6 +1844,19 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 
+  \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 
+  \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 
+  \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 
+  \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 
+  \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf 
+  \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde 
+  \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed 
+  \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc 
+  \xfd \xfe \xff 
 Subject length lower bound = 1

 /[_[:alpha:]]/I
@ -1834,6 +1876,12 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 
+  \x7f 
 Subject length lower bound = 1

 /^[[:^ascii:]]/IB
@ -1847,6 +1895,15 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a 
+  \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 
+  \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 
+  \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 
+  \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 
+  \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 
+  \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 
+  \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 
+  \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[[:blank:]]/IB
@ -1860,6 +1917,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x09 \x20 
 Subject length lower bound = 1

 /^[[:^blank:]]/IB
@ -1873,6 +1931,20 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b 
+  \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a 
+  \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 
+  : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ 
+  _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 
+  \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f 
+  \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e 
+  \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad 
+  \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc 
+  \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb 
+  \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda 
+  \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 
+  \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 
+  \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /[\n\x0b\x0c\x0d[:blank:]]/I
@ -1892,6 +1964,9 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x7f 
 Subject length lower bound = 1

 /^[[:digit:]]/IB
@ -1905,6 +1980,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Subject length lower bound = 1

 /^[[:graph:]]/IB
@ -1918,6 +1994,9 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : 
+  ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ 
+  ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 
 Subject length lower bound = 1

 /^[[:lower:]]/IB
@ -1931,6 +2010,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:print:]]/IB
@ -1944,6 +2024,9 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 
+  9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] 
+  ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 
 Subject length lower bound = 1

 /^[[:punct:]]/IB
@ -1957,6 +2040,8 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ 
+  _ ` { | } ~ 
 Subject length lower bound = 1

 /^[[:space:]]/IB
@ -1970,6 +2055,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 
 Subject length lower bound = 1

 /^[[:upper:]]/IB
@ -1983,6 +2069,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 
 Subject length lower bound = 1

 /^[[:xdigit:]]/IB
@ -1996,6 +2083,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f 
 Subject length lower bound = 1

 /^[[:word:]]/IB
@ -2009,6 +2097,8 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P 
+  Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1

 /^[[:^cntrl:]]/IB
@ -2022,6 +2112,18 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 
+  9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] 
+  ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x80 \x81 
+  \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 
+  \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f 
+  \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae 
+  \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd 
+  \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc 
+  \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb 
+  \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea 
+  \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 
+  \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[12[:^digit:]]/IB
@ -2035,6 +2137,20 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 1 2 : ; < 
+  = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a 
+  b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 
+  \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 
+  \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 
+  \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf 
+  \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe 
+  \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd 
+  \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc 
+  \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb 
+  \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa 
+  \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /^[[:^blank:]]/IB
@ -2048,6 +2164,20 @@ Subject length lower bound = 1
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b 
+  \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a 
+  \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 
+  : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ 
+  _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 
+  \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f 
+  \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e 
+  \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad 
+  \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc 
+  \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb 
+  \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda 
+  \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 
+  \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 
+  \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
 Subject length lower bound = 1

 /[01[:alpha:]%]/IB
@ -2418,6 +2548,7 @@ Subject length lower bound = 4
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 1
    aba
 0: aba
@ -2428,6 +2559,7 @@ Subject length lower bound = 1
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2438,6 +2570,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2448,6 +2581,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2458,6 +2592,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2467,6 +2602,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2478,6 +2614,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2488,6 +2625,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2497,6 +2635,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2506,6 +2645,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2515,6 +2655,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbaa
 0: aabbaa
@ -2524,6 +2665,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2533,6 +2675,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbaa
 0: aabbbaa
@ -2544,6 +2687,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
    aabbbbaa
 0: aabbbbaa
@ -3052,6 +3196,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 5
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3

 /^x(?U)a+b/IB
@ -3067,6 +3212,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Last code unit = 'b'
 Subject length lower bound = 3

@ -3085,6 +3231,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Last code unit = 'b'
 Subject length lower bound = 3

@ -3725,6 +3872,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3

 /(?C)a|b/I
@ -3785,6 +3933,7 @@ No match
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = '>'
 Last code unit = '<'
 Subject length lower bound = 10
   >abc>123<xyz<
@ -3835,6 +3984,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+Starting code units: ( - 0 1 2 3 4 5 6 7 8 9 
 Subject length lower bound = 1
    12
 0: 12
@ -3854,6 +4004,7 @@ No match
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Subject length lower bound = 3
    xyz
 0: xyz
@ -3913,6 +4064,7 @@ Failed: error 114 at offset 10: missing closing parenthesis
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 9
    abcdefabc
 0: abcdefabc
@ -3922,6 +4074,7 @@ Subject length lower bound = 9
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c 
 Subject length lower bound = 2
    a=a
 0: a=a
@ -3937,6 +4090,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c 
 Subject length lower bound = 2
    a=a
 0: a=a
@ -5173,6 +5327,7 @@ No match
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Last code unit = '/'
 Subject length lower bound = 6
    13/05/04\=ps
@ -5270,6 +5425,7 @@ Partial match: c12
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Last code unit = 'X'
 Subject length lower bound = 4
    1\=ps
@ -5643,6 +5799,7 @@ Named capturing subpatterns:
  A   3
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    a1b\=copy=A
 0: a1
@ -5680,6 +5837,7 @@ Named capturing subpatterns:
  A   2
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    ab\=copy=A
 0: ab
@ -5693,6 +5851,7 @@ Named capturing subpatterns:
  A   1
  A   2
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    ab\=copy=A
 0: ab
@ -5711,6 +5870,7 @@ Named capturing subpatterns:
  A   3
  A   4
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    cdefgh\=copy=A
 0: cdefgh
@ -5727,6 +5887,7 @@ Named capturing subpatterns:
  A   3
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    a1b\=get=A
 0: a1
@ -5754,6 +5915,7 @@ Named capturing subpatterns:
  A   2
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
    ab\=get=A
 0: ab
@ -5767,6 +5929,7 @@ Named capturing subpatterns:
  A   1
  A   2
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    ab\=get=A
 0: ab
@ -5785,6 +5948,7 @@ Named capturing subpatterns:
  A   3
  A   4
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
    cdefgh\=get=A
 0: cdefgh
@ -5802,6 +5966,7 @@ Named capturing subpatterns:
 Compile options: <none>
 Overall options: anchored
 Duplicate name status changes
+First code unit = 'a'
 Subject length lower bound = 2
    a1b\=copy=A
 0: a1
@ -5832,6 +5997,7 @@ Named capturing subpatterns:
 Compile options: <none>
 Overall options: anchored
 Duplicate name status changes
+First code unit = 'a'
 Subject length lower bound = 6
    a bc d\=copy=A,copy=B,copy=C
 0: a bc d
@ -6233,6 +6399,7 @@ Subject length lower bound = 4
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2

@ -6249,6 +6416,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2

@ -6265,6 +6433,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2

@ -6281,6 +6450,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Last code unit = 'A'
 Subject length lower bound = 3
    aaaA5
@ -6302,6 +6472,7 @@ No match
 Capturing subpattern count = 0
 Compile options: caseless
 Overall options: anchored caseless
+Starting code units: A a 
 Last code unit = 'A' (caseless)
 Subject length lower bound = 2
    aaaA5
@ -9540,6 +9711,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'F'
 Last code unit = ':'
 Subject length lower bound = 22

@ -9691,6 +9863,7 @@ Named capturing subpatterns:
  D   1
 Compile options: dupnames extended
 Overall options: anchored dupnames extended
+Starting code units: a e 
 Subject length lower bound = 2
    abcdX
 0: abcdX
@ -10445,12 +10618,14 @@ Failed: error 125 at offset 0: lookbehind assertion is not fixed length
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(^ab)++/I
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(^ab|^)+/I
@ -10471,12 +10646,14 @@ Subject length lower bound = 0
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(?:^ab)++/I
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2

 /(?:^ab|^)+/I
@ -11586,6 +11763,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: dotall
 Overall options: anchored dotall
+First code unit = 'a'
 Subject length lower bound = 2

 /.*?a(*SKIP)b/I
@ -11608,6 +11786,7 @@ Subject length lower bound = 2
 Capturing subpattern count = 0
 Compile options: dotall
 Overall options: anchored dotall
+First code unit = 'a'
 Subject length lower bound = 2

 /(?>.*?)(?<=(abcd)|(wxyz))/I
@ -13375,7 +13554,6 @@ Subject length lower bound = 1
 /(|ab)*?d/I,no_start_optimize
 Capturing subpattern count = 1
 Options: no_start_optimize
-Last code unit = 'd'
 Subject length lower bound = 0
   abd
 0: abd
@ -13641,12 +13819,14 @@ get substring list failed (-2): partial match
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3

 /^abc/info,no_dotstar_anchor
 Capturing subpattern count = 0
 Compile options: no_dotstar_anchor
 Overall options: anchored no_dotstar_anchor
+First code unit = 'a'
 Subject length lower bound = 3

 /.*\d/info,auto_callout
@ -14684,6 +14864,7 @@ Capturing subpattern count = 2
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'o'
 Last code unit = '}'
 Subject length lower bound = 65535

@ -15607,6 +15788,7 @@ No match
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'b'
 Subject length lower bound = 2

 /(a){0}.*bc/sI
@ -15885,6 +16067,10 @@ No match
 No match

 /^(?!A(?C1)B)C/
+    ABC\=callout_error=1,no_jit
+No match
+
+/^(?!A(?C1)B)C/no_start_optimize
    ABC\=callout_error=1
 --->ABC
  1 ^^      B
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -194,6 +194,7 @@ Subject length lower bound = 3
 Capturing subpattern count = 0
 Compile options: utf
 Overall options: anchored utf
+Starting code units: a b 
 Subject length lower bound = 1
    bar
 0: b
@ -205,28 +206,6 @@ No match
    \x{100}
 No match

-/^[^ab]/IB,utf
------------------------------------------------------------------
-        Bra
-        ^
-        [\x00-`c-\xff] (neg)
-        Ket
-        End
------------------------------------------------------------------
-Capturing subpattern count = 0
-Compile options: utf
-Overall options: anchored utf
-Subject length lower bound = 1
-    c
- 0: c
-    \x{ff}
- 0: \x{ff}
-    \x{100}
- 0: \x{100}
-\= Expect no match
-    aaa
-No match
-
 /\x{100}*(\d+|"(?1)")/utf
    1234
 0: 1234
@ -479,7 +458,10 @@ Subject length lower bound = 0
    \x{100}X
 0: X

-/^\ሴ/IB,utf
+# Use no_start_optimize because the first code unit is different in 8-bit from
+# the wider modes.
+
+/^\ሴ/IB,utf,no_start_optimize
 ------------------------------------------------------------------
        Bra
        ^
@ -488,9 +470,9 @@ Subject length lower bound = 0
        End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Compile options: utf
-Overall options: anchored utf
-Subject length lower bound = 1
+Compile options: no_start_optimize utf
+Overall options: anchored no_start_optimize utf
+Subject length lower bound = 0

 /()()()()()()()()()()
 ()()()()()()()()()()