Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

This is not yet documented, and it is not yet implemented in JIT.
2020-02-23 16:40:05 +00:00 · 2020-02-23 16:40:05 +00:00 · 4a7dfab0ec
parent d0666136c9
commit 4a7dfab0ec
18 changed files with 893 additions and 125 deletions
--- a/5
+++ b/5
@ -66,6 +66,11 @@ recurse function in JIT.
 17. Fix a crash which occurs when the character type of an invalid UTF
 character is decoded in JIT.
 18. Changes in many areas of the code so that when Unicode is supported and 
 PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for 
 upper/lower case computations on characters whose code points are greater than 
 127. Documentation is not yet updated. JIT is not yet updated.
 Version 10.34 21-November-2019
 ------------------------------
--- a/maint/ManyConfigTests
+++ b/maint/ManyConfigTests
@ -28,8 +28,6 @@
 # The -v option causes a call to 'pcre2test -C' to happen for each
 # configuration.
 # Currently -fsanitize=undefined is not working (locks machine).
 useasan=1
 useusan=1
 usedebug=1
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
 Arguments:
  code        points to start of expression
  utf         TRUE if in UTF mode
  ucp         TRUE if in UCP mode
  fcc         points to the case-flipping table
  list        points to output list
              list[0] will be filled with the opcode
@ -304,7 +305,7 @@ Returns:      points to the start of the next opcode if *code is accepted
 */
 static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
  uint32_t *list)
 {
 PCRE2_UCHAR c = *code;
@ -316,7 +317,8 @@ uint32_t chr;
 uint32_t *clist_dest;
 const uint32_t *clist_src;
 #else
-(void)utf;    /* Suppress "unused parameter" compiler warning */
+(void)utf;    /* Suppress "unused parameter" compiler warnings */
 (void)ucp;
 #endif
 list[0] = c;
@ -396,7 +398,7 @@ switch(c)
  list[2] = chr;
 #ifdef SUPPORT_UNICODE
-  if (chr < 128 || (chr < 256 && !utf))
+  if (chr < 128 || (chr < 256 && !utf && !ucp))
    list[3] = fcc[chr];
  else
    list[3] = UCD_OTHERCASE(chr);
@ -503,6 +505,7 @@ which case the base cannot be possessified.
 Arguments:
  code        points to the byte code
  utf         TRUE in UTF mode
  ucp         TRUE in UCP mode 
  cb          compile data block
  base_list   the data list of the base opcode
  base_end    the end of the base opcode
@ -512,7 +515,7 @@ Returns:      TRUE if the auto-possessification is possible
 */
 static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
  const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
 {
 PCRE2_UCHAR c;
@ -651,7 +654,7 @@ for(;;)
    while (*next_code == OP_ALT)
      {
-      if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
        return FALSE;
      code = next_code + 1 + LINK_SIZE;
      next_code += GET(next_code, 1);
@ -672,7 +675,8 @@ for(;;)
    /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
    next_code += 1 + LINK_SIZE;
-    if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, 
         rec_limit))
      return FALSE;
    code += PRIV(OP_lengths)[c];
@ -688,7 +692,7 @@ for(;;)
  /* We now have the next appropriate opcode to compare with the base. Check
  for a supported opcode, and load its properties. */
-  code = get_chr_property_list(code, utf, cb->fcc, list);
+  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
  if (code == NULL) return FALSE;    /* Unsupported */
  /* If either opcode is a small character list, set pointers for comparing
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
 Arguments:
  code        points to start of the byte code
  utf         TRUE in UTF mode
  cb          compile data block
 Returns:      0 for success
@ -1108,13 +1111,15 @@ Returns:      0 for success
 */
 int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
 {
 PCRE2_UCHAR c;
 PCRE2_SPTR end;
 PCRE2_UCHAR *repeat_opcode;
 uint32_t list[8];
 int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
 BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
 BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
 for (;;)
  {
@ -1126,10 +1131,11 @@ for (;;)
    {
    c -= get_repeat_base(c) - OP_STAR;
    end = (c <= OP_MINUPTO) ?
-      get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
-    if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, 
        &rec_limit))
      {
      switch(c)
        {
@ -1181,11 +1187,11 @@ for (;;)
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
      {
      /* end must not be NULL. */
-      end = get_chr_property_list(code, utf, cb->fcc, list);
+      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
      list[1] = (c & 1) == 0;
-      if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+      if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
        {
        switch (c)
          {
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -4904,7 +4904,7 @@ range. */
 if ((options & PCRE2_CASELESS) != 0)
  {
 #ifdef SUPPORT_UNICODE
-  if ((options & PCRE2_UTF) != 0)
+  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
    {
    int rc;
    uint32_t oc, od;
@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */
 #ifdef SUPPORT_UNICODE
 BOOL utf = (options & PCRE2_UTF) != 0;
-#else  /* No UTF support */
+BOOL ucp = (options & PCRE2_UCP) != 0;
 #else  /* No Unicode support */
 BOOL utf = FALSE;
 #endif
@ -5602,7 +5603,7 @@ for (;; pptr++)
        uint32_t d;
 #ifdef SUPPORT_UNICODE
-        if (utf && c > 127) d = UCD_OTHERCASE(c); else
+        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
 #endif
          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
 {
 BOOL utf;                             /* Set TRUE for UTF mode */
 BOOL ucp;                             /* Set TRUE for UCP mode */
 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
 pcre2_real_code *re = NULL;           /* What we will return */
@ -9919,8 +9921,8 @@ if (utf)
 /* Check UCP lockout. */
-if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
+ucp = (cb.external_options & PCRE2_UCP) != 0;
-    (PCRE2_UCP|PCRE2_NEVER_UCP))
+if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
  {
  errorcode = ERR75;
  goto HAD_EARLY_ERROR;
@ -10296,7 +10298,7 @@ function call. */
 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
  {
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
-  if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
  }
 /* Failed to compile, or error while post-processing. */
@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
    if ((firstcuflags & REQ_CASELESS) != 0)
      {
-      if (firstcu < 128 || (!utf && firstcu < 255))
+      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
        {
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
        }
-      /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
+      /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
-      8-bit UTF mode, codepoints in the range 128-255 are introductory code
+      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
-      points and cannot have another case. In 16-bit and 32-bit modes, we can
+      points and cannot have another case, but if UCP is set they may do. */
      check wide characters when UTF (and therefore UCP) is supported. */
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
+#ifdef SUPPORT_UNICODE
-      else if (firstcu <= MAX_UTF_CODE_POINT &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
        re->flags |= PCRE2_FIRSTCASELESS;
 #else
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
               UCD_OTHERCASE(firstcu) != firstcu)
        re->flags |= PCRE2_FIRSTCASELESS;
 #endif
 #endif  /* SUPPORT_UNICODE */
      }
    }
@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
      if ((reqcuflags & REQ_CASELESS) != 0)
        {
-        if (reqcu < 128 || (!utf && reqcu < 255))
+        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
          {
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
          }
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
+#ifdef SUPPORT_UNICODE
-        else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
+#if PCRE2_CODE_UNIT_WIDTH == 8
-          re->flags |= PCRE2_LASTCASELESS;
+      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
        re->flags |= PCRE2_LASTCASELESS;
 #else
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
               UCD_OTHERCASE(reqcu) != reqcu)
        re->flags |= PCRE2_LASTCASELESS;
 #endif
 #endif  /* SUPPORT_UNICODE */
        }
      }
    }
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
 BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
 #else
 BOOL utf = FALSE;
 #endif
@ -2190,7 +2191,7 @@ for (;;)
      if (clen == 0) break;
 #ifdef SUPPORT_UNICODE
-      if (utf)
+      if (utf_or_ucp)
        {
        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
          {
@ -2204,7 +2205,7 @@ for (;;)
        }
      else
 #endif  /* SUPPORT_UNICODE */
-      /* Not UTF mode */
+      /* Not UTF or UCP mode */
        {
        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
          { ADD_NEW(state_offset + 2, 0); }
@ -2339,7 +2340,7 @@ for (;;)
        {
        uint32_t otherd;
 #ifdef SUPPORT_UNICODE
-        if (utf && d >= 128)
+        if (utf_or_ucp && d >= 128)
          otherd = UCD_OTHERCASE(d);
        else
 #endif  /* SUPPORT_UNICODE */
@ -2374,7 +2375,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2417,7 +2418,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2458,7 +2459,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2491,7 +2492,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2531,7 +2532,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
+#ifdef SUPPORT_UNICODE 
-    if (utf && first_cu > 127)
+#if PCRE2_CODE_UNIT_WIDTH == 8
    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
-#endif
+#else
    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
 #endif       
 #endif  /* SUPPORT_UNICODE */
    }
  }
 else
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
    {
    req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
+#ifdef SUPPORT_UNICODE
-    if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#if PCRE2_CODE_UNIT_WIDTH == 8
    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) 
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
 #else
    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) 
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
 #endif
 #endif  /* SUPPORT_UNICODE */
    }
  }
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1952,7 +1952,7 @@ is available. */
 #define _pcre2_was_newline           PCRE2_SUFFIX(_pcre2_was_newline_)
 #define _pcre2_xclass                PCRE2_SUFFIX(_pcre2_xclass_)
-extern int          _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int          _pcre2_auto_possessify(PCRE2_UCHAR *,
                      const compile_block *);
 extern int          _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
                      int *, uint32_t, uint32_t, BOOL, compile_block *);
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2015-2019 University of Cambridge
+          New API code Copyright (c) 2015-2020 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -598,12 +598,13 @@ BOOL condition;         /* Used in conditional groups */
 BOOL cur_is_word;       /* Used in "word" tests */
 BOOL prev_is_word;      /* Used in "word" tests */
-/* UTF flag */
+/* UTF and UCP flags */
 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
 BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
 #else
-BOOL utf = FALSE;
+BOOL utf = FALSE;  /* Required for convenience even when no Unicode support */
 #endif
 /* This is the length of the last part of a backtracking frame that must be
@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      }
    else
 #endif
    /* Not UTF mode */
      {
      if (mb->end_subject - Feptr < 1)
@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
        if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
        }
      }
    /* If UCP is set without UTF we must do the same as above, but with one
    character per code unit. */
    else if (ucp)
      {
      uint32_t cc = UCHAR21(Feptr);
      fc = Fecode[1];
      if (fc < 128)
        {
        if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
        }
      else
        {
        if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
        }
      Feptr++;
      Fecode += 2;
      }
    else
 #endif   /* SUPPORT_UNICODE */
-    /* Not UTF mode; use the table for characters < 256. */
+    /* Not UTF or UCP mode; use the table for characters < 256. */
      {
      if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
          != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      SCHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
 #ifdef SUPPORT_UNICODE
    if (utf)
      {
@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
        if (ch > 127)
          ch = UCD_OTHERCASE(ch);
        else
-          ch = TABLE_GET(ch, mb->fcc, ch);
+          ch = (mb->fcc)[ch];
        if (ch == fc) RRETURN(MATCH_NOMATCH);
        }
      }
    /* UCP without UTF is as above, but with one character per code unit. */
    else if (ucp)
      {
      uint32_t ch;
      fc = UCHAR21INC(Feptr);
      ch = Fecode[1];
      Fecode += 2;
      if (ch == fc)
        {
        RRETURN(MATCH_NOMATCH);  /* Caseful match */
        }
      else if (Fop == OP_NOTI)   /* If caseless */
        {
        if (ch > 127)
          ch = UCD_OTHERCASE(ch);
        else
          ch = (mb->fcc)[ch];
        if (ch == fc) RRETURN(MATCH_NOMATCH);
        }
      }
    else
 #endif  /* SUPPORT_UNICODE */
    /* Neither UTF nor UCP is set */
      {
      uint32_t ch = Fecode[1];
-      fc = *Feptr++;
+      fc = UCHAR21INC(Feptr);
      if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
        RRETURN(MATCH_NOMATCH);
      Fecode += 2;
@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
 #endif  /* SUPPORT_UNICODE */
    /* When not in UTF mode, load a single-code-unit character. Then proceed as
-    above. */
+    above, using Unicode casing if either UTF or UCP is set. */
    Lc = *Fecode++;
@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    if (Fop >= OP_STARI)
      {
 #if PCRE2_CODE_UNIT_WIDTH == 8
-      /* Lc must be < 128 in UTF-8 mode. */
+#ifdef SUPPORT_UNICODE
      if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
      else
 #endif  /* SUPPORT_UNICODE */
      /* Lc will be < 128 in UTF-8 mode. */
      Loc = mb->fcc[Lc];
 #else /* 16-bit & 32-bit */
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
      else
 #endif  /* SUPPORT_UNICODE */
      Loc = TABLE_GET(Lc, mb->fcc, Lc);
@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    if (Fop >= OP_NOTSTARI)     /* Caseless */
      {
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127)
+      if ((utf || ucp) && Lc > 127)
        Loc = UCD_OTHERCASE(Lc);
      else
 #endif /* SUPPORT_UNICODE */
@ -6045,7 +6099,6 @@ BOOL firstline;
 BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;
 BOOL startline;
 BOOL utf;
 #if PCRE2_CODE_UNIT_WIDTH == 8
 BOOL memchr_not_found_first_cu = FALSE;
@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
 BOOL use_jit;
 #endif
 /* This flag is needed even when Unicode is not supported for convenience
 (it is used by the IS_NEWLINE macro). */
 BOOL utf = FALSE;
 #ifdef SUPPORT_UNICODE
 BOOL ucp = FALSE;
 BOOL allow_invalid;
 uint32_t fragment_options = 0;
 #ifdef SUPPORT_JIT
 BOOL jit_checked_utf = FALSE;
 #endif
-#endif
+#endif  /* SUPPORT_UNICODE */
 PCRE2_SIZE frame_size;
@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
          (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
 #endif
-/* Initialize UTF parameters. */
+/* Initialize UTF/UCP parameters. */
 utf = (re->overall_options & PCRE2_UTF) != 0;
 #ifdef SUPPORT_UNICODE
 utf = (re->overall_options & PCRE2_UTF) != 0;
 allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
-#endif
+ucp = (re->overall_options & PCRE2_UCP) != 0;
 #endif  /* SUPPORT_UNICODE */
 /* Convert the partial matching flags into an integer. */
@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
    first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
+#ifdef SUPPORT_UNICODE
-    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+#if PCRE2_CODE_UNIT_WIDTH == 8
    if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
 #else
    if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
 #endif  /* SUPPORT_UNICODE */
    }
  }
 else
@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
    {
    req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
+#ifdef SUPPORT_UNICODE
-    if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
+#if PCRE2_CODE_UNIT_WIDTH == 8
    if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
 #else
    if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
 #endif
 #endif  /* SUPPORT_UNICODE */
    }
  }
@ -6756,15 +6824,16 @@ for(;;)
 #endif
          }
-        /* If we can't find the required code unit, having reached the true end
+        /* If we can't find the required first code unit, having reached the
-        of the subject, break the bumpalong loop, to force a match failure,
+        true end of the subject, break the bumpalong loop, to force a match
-        except when doing partial matching, when we let the next cycle run at
+        failure, except when doing partial matching, when we let the next cycle
-        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+        run at the end of the subject. To see why, consider the pattern
-        which partially matches "abc", even though the string does not contain
+        /(?<=abc)def/, which partially matches "abc", even though the string
-        the starting character "d". If we have not reached the true end of the
+        does not contain the starting character "d". If we have not reached the
-        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+        true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
-        we also let the cycle run, because the matching string is legitimately
+        temporarily modified) we also let the cycle run, because the matching
-        allowed to start with the first code unit of a newline. */
+        string is legitimately allowed to start with the first code unit of a
        newline. */
        if (mb->partial == 0 && start_match >= mb->end_subject)
          {
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -772,15 +772,19 @@ Arguments:
  p             points to the first code unit of the character
  caseless      TRUE if caseless
  utf           TRUE for UTF mode
  ucp           TRUE for UCP mode 
 Returns:        pointer after the character
 */
 static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, 
  BOOL ucp)
 {
 uint32_t c = *p++;   /* First code unit */
-(void)utf;           /* Stop compiler warning when UTF not supported */
+
 (void)utf;           /* Stop compiler warnings when UTF not supported */
 (void)ucp;
 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
 0xff. */
@ -810,22 +814,26 @@ if (utf)
 if (caseless)
  {
 #ifdef SUPPORT_UNICODE
-  if (utf)
+  if (utf || ucp)
    {
    c = UCD_OTHERCASE(c);
 #if PCRE2_CODE_UNIT_WIDTH == 8
-    PCRE2_UCHAR buff[6];
+    if (utf)
-    c = UCD_OTHERCASE(c);
+      { 
-    (void)PRIV(ord2utf)(c, buff);
+      PCRE2_UCHAR buff[6];
-    SET_BIT(buff[0]);
+      (void)PRIV(ord2utf)(c, buff);
      SET_BIT(buff[0]);
      }
    else SET_BIT(c);    
 #else  /* 16-bit or 32-bit mode */
    c = UCD_OTHERCASE(c);
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
 #endif
    }
  else
 #endif  /* SUPPORT_UNICODE */
-  /* Not UTF */
+  /* Not UTF or UCP */
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
  }
@ -931,6 +939,7 @@ Arguments:
  re           points to the compiled regex block
  code         points to an expression
  utf          TRUE if in UTF mode
  ucp          TRUE if in UCP mode 
  depthptr     pointer to recurse depth
 Returns:       SSB_FAIL     => Failed to find any starting code units
@ -941,7 +950,8 @@ Returns:       SSB_FAIL     => Failed to find any starting code units
 */
 static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
  int *depthptr)
 {
 uint32_t c;
 int yield = SSB_DONE;
@ -1111,7 +1121,7 @@ do
      case OP_SCRIPT_RUN:
      case OP_ASSERT:
      case OP_ASSERT_NA:
-      rc = set_start_bits(re, tcode, utf, depthptr);
+      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
      if (rc == SSB_DONE)
        {
        try_next = FALSE;
@ -1167,7 +1177,7 @@ do
      case OP_BRAZERO:
      case OP_BRAMINZERO:
      case OP_BRAPOSZERO:
-      rc = set_start_bits(re, ++tcode, utf, depthptr);
+      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
      tcode += 1 + LINK_SIZE;
@ -1189,7 +1199,7 @@ do
      case OP_QUERY:
      case OP_MINQUERY:
      case OP_POSQUERY:
-      tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
      break;
      case OP_STARI:
@ -1198,7 +1208,7 @@ do
      case OP_QUERYI:
      case OP_MINQUERYI:
      case OP_POSQUERYI:
-      tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
      break;
      /* Single-char upto sets the bit and tries the next */
@ -1206,13 +1216,13 @@ do
      case OP_UPTO:
      case OP_MINUPTO:
      case OP_POSUPTO:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
      break;
      case OP_UPTOI:
      case OP_MINUPTOI:
      case OP_POSUPTOI:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
      break;
      /* At least one single char sets the bit and stops */
@ -1224,7 +1234,7 @@ do
      case OP_PLUS:
      case OP_MINPLUS:
      case OP_POSPLUS:
-      (void)set_table_bit(re, tcode + 1, FALSE, utf);
+      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
      try_next = FALSE;
      break;
@ -1235,7 +1245,7 @@ do
      case OP_PLUSI:
      case OP_MINPLUSI:
      case OP_POSPLUSI:
-      (void)set_table_bit(re, tcode + 1, TRUE, utf);
+      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
      try_next = FALSE;
      break;
@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
 int count = 0;
 PCRE2_UCHAR *code;
 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
 BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
 /* Find start of compiled code */
@ -1677,7 +1688,7 @@ code units. */
 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
  {
  int depth = 0;
-  int rc = set_start_bits(re, code, utf, &depth);
+  int rc = set_start_bits(re, code, utf, ucp, &depth);
  if (rc == SSB_UNKNOWN) return 1;
  /* If a list of starting code units was set up, scan the list to see if only
@ -1695,7 +1706,7 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
    int b = -1;
    uint8_t *p = re->start_bitmap;
    uint32_t flags = PCRE2_FIRSTMAPSET;
-
+    
    for (i = 0; i < 256; p++, i += 8)
      {
      uint8_t x = *p;
@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
          }
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
-        mode, only values < 128 can be used. */
+        mode, only values < 128 can be used. In all the other cases, c is a 
        character value. */
 #if PCRE2_CODE_UNIT_WIDTH == 8
-        if (c > 127) goto DONE;
+        if (utf && c > 127) goto DONE;
 #endif
-        if (a < 0) a = c;   /* First one found */
+        if (a < 0) a = c;   /* First one found, save in a */
        else if (b < 0)     /* Second one found */
          {
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
-
+          
 #ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
+          if (utf || ucp)
-          if (utf && UCD_CASESET(c) != 0) goto DONE;   /* Multiple case set */
+            { 
-#else   /* 16-bit or 32-bit */
+            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
-          if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
+            if (c > 127) d = UCD_OTHERCASE(c);
-          if (utf && c > 127) d = UCD_OTHERCASE(c);
+            }
 #endif  /* Code width */
 #endif  /* SUPPORT_UNICODE */
-          if (d != a) goto DONE;   /* Not other case of a */
+          if (d != a) goto DONE;   /* Not the other case of a */
-          b = c;
+          b = c;                   /* Save second in b */
          }
        else goto DONE;   /* More than two characters found */
        }
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@ -236,6 +236,7 @@ BOOL use_existing_match;
 BOOL replacement_only;
 #ifdef SUPPORT_UNICODE
 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
 #endif
 PCRE2_UCHAR temp[6];
 PCRE2_SPTR ptr;
@ -758,7 +759,7 @@ do
          if (forcecase != 0)
            {
 #ifdef SUPPORT_UNICODE
-            if (utf)
+            if (utf || ucp)
              {
              uint32_t type = UCD_CHARTYPE(ch);
              if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -860,7 +861,7 @@ do
      if (forcecase != 0)
        {
 #ifdef SUPPORT_UNICODE
-        if (utf)
+        if (utf || ucp)
          {
          uint32_t type = UCD_CHARTYPE(ch);
          if (PRIV(ucp_gentype)[type] == ucp_L &&
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -570,8 +570,10 @@
 /[\xff\x{ffff}]/I,utf
 /[\xff\x{ff}]/I,utf
    abc\x{ff}def
 /[\xff\x{ff}]/I
    abc\x{ff}def
 /[Ss]/I
@ -585,4 +587,31 @@
    abc\x80\=startchar
    abc\x80\=startchar,offset=3
 #subject no_jit
 /\x{c1}+\x{e1}/iIB,ucp
    \x{c1}\x{c1}\x{c1}
    \x{e1}\x{e1}\x{e1} 
 /a|\x{c1}/iI,ucp
    \x{e1}xxx
 /a|\x{c1}/iI,utf
    \x{e1}xxx
 /\x{c1}|\x{e1}/iI,ucp
 /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{e1}Y
 /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
    X\x{c1}Y
 # Without UTF or UCP characters > 127 have only one case in the default locale.
 /X(\x{e1})Y/replace=>\U$1<,substitute_extended
    X\x{e1}Y
 #subject     
 # End of testinput10
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -463,4 +463,71 @@
 /(?:\x{ff}|\x{3000})/I,utf
 # ---------------------------------------------------- 
 # UCP and casing tests
 /\x{120}/i,I
 /\x{c1}/i,I,ucp
 /[\x{120}\x{121}]/iB,ucp
 /[ab\x{120}]+/iB,ucp
    aABb\x{121}\x{120}
 #subject no_jit
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 /\x{120}\x{c1}/i,ucp,no_start_optimize
    \x{121}\x{e1}
 /\x{120}\x{c1}/i,ucp
    \x{121}\x{e1}
 /[^\x{120}]/i,no_start_optimize
    \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 \= Expect no match
    \x{121}
 /[^\x{120}]/i
    \x{121}
 /[^\x{120}]/i,ucp
 \= Expect no match
    \x{121}
 /\x{120}{2}/i,ucp
    \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 \= Expect no match
    \x{121}\x{121}
 /\x{c1}+\x{e1}/iB,ucp
    \x{c1}\x{c1}\x{c1}
 /\x{c1}+\x{e1}/iIB,ucp
    \x{c1}\x{c1}\x{c1}
    \x{e1}\x{e1}\x{e1} 
 /a|\x{c1}/iI,ucp
    \x{e1}xxx
 /\x{c1}|\x{e1}/iI,ucp
 /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{e1}Y
 /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{121}Y
 #subject 
 # ---------------------------------------------------- 
 # End of testinput12
--- a/testdata/testinput14
+++ b/testdata/testinput14
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
+# These test special UTF and UCP features of DFA matching. The output is
-# selection of the more comprehensive tests that are run for non-DFA matching.
+# different for the different widths.
 # The output is different for the different widths.
 #subject dfa
 # ---------------------------------------------------- 
 # These are a selection of the more comprehensive tests that are run for
 # non-DFA matching.
 /X/utf
    XX\x{d800}
    XX\x{d800}\=offset=3
@ -33,5 +36,46 @@
    XX\xef\x80\=ph
    \xf7\=ph
    \xf7\x80\=ph
 # ---------------------------------------------------- 
 # UCP and casing tests - except for the first two, these will all fail in 8-bit
 # mode because they are testing UCP without UTF and use characters > 255.
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 /\x{c1}+\x{e1}/iB,ucp
    \x{c1}\x{c1}\x{c1}
    \x{e1}\x{e1}\x{e1} 
 /\x{120}\x{c1}/i,ucp,no_start_optimize
    \x{121}\x{e1}
 /\x{120}\x{c1}/i,ucp
    \x{121}\x{e1}
 /[^\x{120}]/i,no_start_optimize
    \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 \= Expect no match
    \x{121}
 /[^\x{120}]/i
    \x{121}
 /[^\x{120}]/i,ucp
 \= Expect no match
    \x{121}
 /\x{120}{2}/i,ucp
    \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 \= Expect no match
    \x{121}\x{121}
 # ---------------------------------------------------- 
 # End of testinput14
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1780,11 +1780,15 @@ Capture group count = 0
 Options: utf
 Starting code units: \xc3 
 Subject length lower bound = 1
    abc\x{ff}def
 0: \x{ff}
 /[\xff\x{ff}]/I
 Capture group count = 0
-Starting code units: \xff 
+First code unit = \xff
 Subject length lower bound = 1
    abc\x{ff}def
 0: \xff
 /[Ss]/I
 Capture group count = 0
@ -1813,4 +1817,62 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3
    abc\x80\=startchar,offset=3
 Error -36 (bad UTF-8 offset)
 #subject no_jit
 /\x{c1}+\x{e1}/iIB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Last code unit = \xe1 (caseless)
 Subject length lower bound = 2
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
    \x{e1}\x{e1}\x{e1} 
 0: \xe1\xe1\xe1
 /a|\x{c1}/iI,ucp
 Capture group count = 0
 Options: caseless ucp
 Starting code units: A a \xc1 \xe1 
 Subject length lower bound = 1
    \x{e1}xxx
 0: \xe1
 /a|\x{c1}/iI,utf
 Capture group count = 0
 Options: caseless utf
 Starting code units: A a \xc3 
 Subject length lower bound = 1
    \x{e1}xxx
 0: \x{e1}
 /\x{c1}|\x{e1}/iI,ucp
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Subject length lower bound = 1
 /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{e1}Y
 1: >\xc1<
 /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
    X\x{c1}Y
 1: >\xe1<
 # Without UTF or UCP characters > 127 have only one case in the default locale.
 /X(\x{e1})Y/replace=>\U$1<,substitute_extended
    X\x{e1}Y
 1: >\xe1<
 #subject     
 # End of testinput10
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -1613,7 +1613,7 @@ Subject length lower bound = 1
 /[Ss]/I
 Capture group count = 0
-Starting code units: S s 
+First code unit = 'S' (caseless)
 Subject length lower bound = 1
 /[Ss]/I,utf
@ -1628,4 +1628,134 @@ Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 # ---------------------------------------------------- 
 # UCP and casing tests
 /\x{120}/i,I
 Capture group count = 0
 Options: caseless
 First code unit = \x{120}
 Subject length lower bound = 1
 /\x{c1}/i,I,ucp
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Subject length lower bound = 1
 /[\x{120}\x{121}]/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{120}
        Ket
        End
 ------------------------------------------------------------------
 /[ab\x{120}]+/iB,ucp
 ------------------------------------------------------------------
        Bra
        [ABab\x{120}-\x{121}]++
        Ket
        End
 ------------------------------------------------------------------
    aABb\x{121}\x{120}
 0: aABb\x{121}\x{120}
 #subject no_jit
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 No match
 /\x{120}\x{c1}/i,ucp,no_start_optimize
    \x{121}\x{e1}
 0: \x{121}\xe1
 /\x{120}\x{c1}/i,ucp
    \x{121}\x{e1}
 0: \x{121}\xe1
 /[^\x{120}]/i,no_start_optimize
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 \= Expect no match
    \x{121}
 No match
 /[^\x{120}]/i
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp
 \= Expect no match
    \x{121}
 No match
 /\x{120}{2}/i,ucp
    \x{121}\x{121}
 0: \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 \= Expect no match
    \x{121}\x{121}
 No match
 /\x{c1}+\x{e1}/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
 /\x{c1}+\x{e1}/iIB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Last code unit = \xe1 (caseless)
 Subject length lower bound = 2
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
    \x{e1}\x{e1}\x{e1} 
 0: \xe1\xe1\xe1
 /a|\x{c1}/iI,ucp
 Capture group count = 0
 Options: caseless ucp
 Starting code units: A a \xc1 \xe1 
 Subject length lower bound = 1
    \x{e1}xxx
 0: \xe1
 /\x{c1}|\x{e1}/iI,ucp
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Subject length lower bound = 1
 /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{e1}Y
 1: >\xc1<
 /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{121}Y
 1: >\x{120}<
 #subject 
 # ---------------------------------------------------- 
 # End of testinput12
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -1611,7 +1611,7 @@ Subject length lower bound = 1
 /[Ss]/I
 Capture group count = 0
-Starting code units: S s 
+First code unit = 'S' (caseless)
 Subject length lower bound = 1
 /[Ss]/I,utf
@ -1626,4 +1626,134 @@ Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1
 # ---------------------------------------------------- 
 # UCP and casing tests
 /\x{120}/i,I
 Capture group count = 0
 Options: caseless
 First code unit = \x{120}
 Subject length lower bound = 1
 /\x{c1}/i,I,ucp
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Subject length lower bound = 1
 /[\x{120}\x{121}]/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{120}
        Ket
        End
 ------------------------------------------------------------------
 /[ab\x{120}]+/iB,ucp
 ------------------------------------------------------------------
        Bra
        [ABab\x{120}-\x{121}]++
        Ket
        End
 ------------------------------------------------------------------
    aABb\x{121}\x{120}
 0: aABb\x{121}\x{120}
 #subject no_jit
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 No match
 /\x{120}\x{c1}/i,ucp,no_start_optimize
    \x{121}\x{e1}
 0: \x{121}\xe1
 /\x{120}\x{c1}/i,ucp
    \x{121}\x{e1}
 0: \x{121}\xe1
 /[^\x{120}]/i,no_start_optimize
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 \= Expect no match
    \x{121}
 No match
 /[^\x{120}]/i
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp
 \= Expect no match
    \x{121}
 No match
 /\x{120}{2}/i,ucp
    \x{121}\x{121}
 0: \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 \= Expect no match
    \x{121}\x{121}
 No match
 /\x{c1}+\x{e1}/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
 /\x{c1}+\x{e1}/iIB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Last code unit = \xe1 (caseless)
 Subject length lower bound = 2
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
    \x{e1}\x{e1}\x{e1} 
 0: \xe1\xe1\xe1
 /a|\x{c1}/iI,ucp
 Capture group count = 0
 Options: caseless ucp
 Starting code units: A a \xc1 \xe1 
 Subject length lower bound = 1
    \x{e1}xxx
 0: \xe1
 /\x{c1}|\x{e1}/iI,ucp
 Capture group count = 0
 Options: caseless ucp
 First code unit = \xc1 (caseless)
 Subject length lower bound = 1
 /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{e1}Y
 1: >\xc1<
 /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
    X\x{121}Y
 1: >\x{120}<
 #subject 
 # ---------------------------------------------------- 
 # End of testinput12
--- a/testdata/testoutput14-16
+++ b/testdata/testoutput14-16
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
+# These test special UTF and UCP features of DFA matching. The output is
-# selection of the more comprehensive tests that are run for non-DFA matching.
+# different for the different widths.
 # The output is different for the different widths.
 #subject dfa
 # ---------------------------------------------------- 
 # These are a selection of the more comprehensive tests that are run for
 # non-DFA matching.
 /X/utf
    XX\x{d800}
 Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
@ -57,5 +60,66 @@ No match
 No match
    \xf7\x80\=ph
 No match
 # ---------------------------------------------------- 
 # UCP and casing tests - except for the first two, these will all fail in 8-bit
 # mode because they are testing UCP without UTF and use characters > 255.
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 No match
 /\x{c1}+\x{e1}/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
 1: \xc1\xc1
    \x{e1}\x{e1}\x{e1} 
 0: \xe1\xe1\xe1
 1: \xe1\xe1
 /\x{120}\x{c1}/i,ucp,no_start_optimize
    \x{121}\x{e1}
 0: \x{121}\xe1
 /\x{120}\x{c1}/i,ucp
    \x{121}\x{e1}
 0: \x{121}\xe1
 /[^\x{120}]/i,no_start_optimize
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 \= Expect no match
    \x{121}
 No match
 /[^\x{120}]/i
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp
 \= Expect no match
    \x{121}
 No match
 /\x{120}{2}/i,ucp
    \x{121}\x{121}
 0: \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 \= Expect no match
    \x{121}\x{121}
 No match
 # ---------------------------------------------------- 
 # End of testinput14
--- a/testdata/testoutput14-32
+++ b/testdata/testoutput14-32
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
+# These test special UTF and UCP features of DFA matching. The output is
-# selection of the more comprehensive tests that are run for non-DFA matching.
+# different for the different widths.
 # The output is different for the different widths.
 #subject dfa
 # ---------------------------------------------------- 
 # These are a selection of the more comprehensive tests that are run for
 # non-DFA matching.
 /X/utf
    XX\x{d800}
 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
@ -57,5 +60,66 @@ No match
 No match
    \xf7\x80\=ph
 No match
 # ---------------------------------------------------- 
 # UCP and casing tests - except for the first two, these will all fail in 8-bit
 # mode because they are testing UCP without UTF and use characters > 255.
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 No match
 /\x{c1}+\x{e1}/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
 1: \xc1\xc1
    \x{e1}\x{e1}\x{e1} 
 0: \xe1\xe1\xe1
 1: \xe1\xe1
 /\x{120}\x{c1}/i,ucp,no_start_optimize
    \x{121}\x{e1}
 0: \x{121}\xe1
 /\x{120}\x{c1}/i,ucp
    \x{121}\x{e1}
 0: \x{121}\xe1
 /[^\x{120}]/i,no_start_optimize
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 \= Expect no match
    \x{121}
 No match
 /[^\x{120}]/i
    \x{121}
 0: \x{121}
 /[^\x{120}]/i,ucp
 \= Expect no match
    \x{121}
 No match
 /\x{120}{2}/i,ucp
    \x{121}\x{121}
 0: \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 \= Expect no match
    \x{121}\x{121}
 No match
 # ---------------------------------------------------- 
 # End of testinput14
--- a/testdata/testoutput14-8
+++ b/testdata/testoutput14-8
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
+# These test special UTF and UCP features of DFA matching. The output is
-# selection of the more comprehensive tests that are run for non-DFA matching.
+# different for the different widths.
 # The output is different for the different widths.
 #subject dfa
 # ---------------------------------------------------- 
 # These are a selection of the more comprehensive tests that are run for
 # non-DFA matching.
 /X/utf
    XX\x{d800}
 Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
@ -57,5 +60,66 @@ Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
 Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
    \xf7\x80\=ph
 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
 # ---------------------------------------------------- 
 # UCP and casing tests - except for the first two, these will all fail in 8-bit
 # mode because they are testing UCP without UTF and use characters > 255.
 /\x{c1}/i,no_start_optimize
 \= Expect no match
    \x{e1}
 No match
 /\x{c1}+\x{e1}/iB,ucp
 ------------------------------------------------------------------
        Bra
     /i \x{c1}+
     /i \x{e1}
        Ket
        End
 ------------------------------------------------------------------
    \x{c1}\x{c1}\x{c1}
 0: \xc1\xc1\xc1
 1: \xc1\xc1
    \x{e1}\x{e1}\x{e1} 
 0: \xe1\xe1\xe1
 1: \xe1\xe1
 /\x{120}\x{c1}/i,ucp,no_start_optimize
 Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
    \x{121}\x{e1}
 /\x{120}\x{c1}/i,ucp
 Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
    \x{121}\x{e1}
 /[^\x{120}]/i,no_start_optimize
 Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
    \x{121}
 /[^\x{120}]/i,ucp,no_start_optimize
 Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
 \= Expect no match
    \x{121}
 /[^\x{120}]/i
 Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
    \x{121}
 /[^\x{120}]/i,ucp
 Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
 \= Expect no match
    \x{121}
 /\x{120}{2}/i,ucp
 Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
    \x{121}\x{121}
 /[^\x{120}]{2}/i,ucp
 Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
 \= Expect no match
    \x{121}\x{121}
 # ---------------------------------------------------- 
 # End of testinput14