Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

This is not yet documented, and it is not yet implemented in JIT.
2020-02-23 16:40:05 +00:00 · 2020-02-23 16:40:05 +00:00 · 4a7dfab0ec
parent d0666136c9
commit 4a7dfab0ec
18 changed files with 893 additions and 125 deletions
--- a/5
+++ b/5
@ -66,6 +66,11 @@ recurse function in JIT.
 17. Fix a crash which occurs when the character type of an invalid UTF
 character is decoded in JIT.

+18. Changes in many areas of the code so that when Unicode is supported and 
+PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for 
+upper/lower case computations on characters whose code points are greater than 
+127. Documentation is not yet updated. JIT is not yet updated.
+

 Version 10.34 21-November-2019
 ------------------------------
--- a/maint/ManyConfigTests
+++ b/maint/ManyConfigTests
@ -28,8 +28,6 @@
 # The -v option causes a call to 'pcre2test -C' to happen for each
 # configuration.

-# Currently -fsanitize=undefined is not working (locks machine).
-
 useasan=1
 useusan=1
 usedebug=1
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
 Arguments:
  code        points to start of expression
  utf         TRUE if in UTF mode
+  ucp         TRUE if in UCP mode
  fcc         points to the case-flipping table
  list        points to output list
              list[0] will be filled with the opcode
@ -304,7 +305,7 @@ Returns:      points to the start of the next opcode if *code is accepted
 */

 static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
  uint32_t *list)
 {
 PCRE2_UCHAR c = *code;
@ -316,7 +317,8 @@ uint32_t chr;
 uint32_t *clist_dest;
 const uint32_t *clist_src;
 #else
-(void)utf;    /* Suppress "unused parameter" compiler warning */
+(void)utf;    /* Suppress "unused parameter" compiler warnings */
+(void)ucp;
 #endif

 list[0] = c;
@ -396,7 +398,7 @@ switch(c)
  list[2] = chr;

 #ifdef SUPPORT_UNICODE
-  if (chr < 128 || (chr < 256 && !utf))
+  if (chr < 128 || (chr < 256 && !utf && !ucp))
    list[3] = fcc[chr];
  else
    list[3] = UCD_OTHERCASE(chr);
@ -503,6 +505,7 @@ which case the base cannot be possessified.
 Arguments:
  code        points to the byte code
  utf         TRUE in UTF mode
+  ucp         TRUE in UCP mode 
  cb          compile data block
  base_list   the data list of the base opcode
  base_end    the end of the base opcode
@ -512,7 +515,7 @@ Returns:      TRUE if the auto-possessification is possible
 */

 static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
  const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
 {
 PCRE2_UCHAR c;
@ -651,7 +654,7 @@ for(;;)

    while (*next_code == OP_ALT)
      {
-      if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
        return FALSE;
      code = next_code + 1 + LINK_SIZE;
      next_code += GET(next_code, 1);
@ -672,7 +675,8 @@ for(;;)
    /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */

    next_code += 1 + LINK_SIZE;
-    if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, 
+         rec_limit))
      return FALSE;

    code += PRIV(OP_lengths)[c];
@ -688,7 +692,7 @@ for(;;)
  /* We now have the next appropriate opcode to compare with the base. Check
  for a supported opcode, and load its properties. */

-  code = get_chr_property_list(code, utf, cb->fcc, list);
+  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
  if (code == NULL) return FALSE;    /* Unsupported */

  /* If either opcode is a small character list, set pointers for comparing
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.

 Arguments:
  code        points to start of the byte code
-  utf         TRUE in UTF mode
  cb          compile data block

 Returns:      0 for success
@ -1108,13 +1111,15 @@ Returns:      0 for success
 */

 int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
 {
 PCRE2_UCHAR c;
 PCRE2_SPTR end;
 PCRE2_UCHAR *repeat_opcode;
 uint32_t list[8];
 int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
+BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
+BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;

 for (;;)
  {
@ -1126,10 +1131,11 @@ for (;;)
    {
    c -= get_repeat_base(c) - OP_STAR;
    end = (c <= OP_MINUPTO) ?
-      get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;

-    if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, 
+        &rec_limit))
      {
      switch(c)
        {
@ -1181,11 +1187,11 @@ for (;;)
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
      {
      /* end must not be NULL. */
-      end = get_chr_property_list(code, utf, cb->fcc, list);
+      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);

      list[1] = (c & 1) == 0;

-      if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+      if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
        {
        switch (c)
          {
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -4904,7 +4904,7 @@ range. */
 if ((options & PCRE2_CASELESS) != 0)
  {
 #ifdef SUPPORT_UNICODE
-  if ((options & PCRE2_UTF) != 0)
+  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
    {
    int rc;
    uint32_t oc, od;
@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */

 #ifdef SUPPORT_UNICODE
 BOOL utf = (options & PCRE2_UTF) != 0;
-#else  /* No UTF support */
+BOOL ucp = (options & PCRE2_UCP) != 0;
+#else  /* No Unicode support */
 BOOL utf = FALSE;
 #endif

@ -5602,7 +5603,7 @@ for (;; pptr++)
        uint32_t d;

 #ifdef SUPPORT_UNICODE
-        if (utf && c > 127) d = UCD_OTHERCASE(c); else
+        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
 #endif
          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
 {
 BOOL utf;                             /* Set TRUE for UTF mode */
+BOOL ucp;                             /* Set TRUE for UCP mode */
 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
 pcre2_real_code *re = NULL;           /* What we will return */
@ -9919,8 +9921,8 @@ if (utf)

 /* Check UCP lockout. */

-if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
-    (PCRE2_UCP|PCRE2_NEVER_UCP))
+ucp = (cb.external_options & PCRE2_UCP) != 0;
+if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
  {
  errorcode = ERR75;
  goto HAD_EARLY_ERROR;
@ -10296,7 +10298,7 @@ function call. */
 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
  {
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
-  if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
  }

 /* Failed to compile, or error while post-processing. */
@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)

    if ((firstcuflags & REQ_CASELESS) != 0)
      {
-      if (firstcu < 128 || (!utf && firstcu < 255))
+      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
        {
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
        }

-      /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
-      8-bit UTF mode, codepoints in the range 128-255 are introductory code
-      points and cannot have another case. In 16-bit and 32-bit modes, we can
-      check wide characters when UTF (and therefore UCP) is supported. */
+      /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
+      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
+      points and cannot have another case, but if UCP is set they may do. */

-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      else if (firstcu <= MAX_UTF_CODE_POINT &&
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
+        re->flags |= PCRE2_FIRSTCASELESS;
+#else
+      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
               UCD_OTHERCASE(firstcu) != firstcu)
        re->flags |= PCRE2_FIRSTCASELESS;
 #endif
+#endif  /* SUPPORT_UNICODE */
      }
    }

@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)

      if ((reqcuflags & REQ_CASELESS) != 0)
        {
-        if (reqcu < 128 || (!utf && reqcu < 255))
+        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
          {
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
          }
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-        else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
+#else
+      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
+               UCD_OTHERCASE(reqcu) != reqcu)
        re->flags |= PCRE2_LASTCASELESS;
 #endif
+#endif  /* SUPPORT_UNICODE */
        }
      }
    }
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;

 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
 #else
 BOOL utf = FALSE;
 #endif
@ -2190,7 +2191,7 @@ for (;;)
      if (clen == 0) break;

 #ifdef SUPPORT_UNICODE
-      if (utf)
+      if (utf_or_ucp)
        {
        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
          {
@ -2204,7 +2205,7 @@ for (;;)
        }
      else
 #endif  /* SUPPORT_UNICODE */
-      /* Not UTF mode */
+      /* Not UTF or UCP mode */
        {
        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
          { ADD_NEW(state_offset + 2, 0); }
@ -2339,7 +2340,7 @@ for (;;)
        {
        uint32_t otherd;
 #ifdef SUPPORT_UNICODE
-        if (utf && d >= 128)
+        if (utf_or_ucp && d >= 128)
          otherd = UCD_OTHERCASE(d);
        else
 #endif  /* SUPPORT_UNICODE */
@ -2374,7 +2375,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2417,7 +2418,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2458,7 +2459,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2491,7 +2492,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -2531,7 +2532,7 @@ for (;;)
        if (caseless)
          {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
 #endif  /* SUPPORT_UNICODE */
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && first_cu > 127)
+#ifdef SUPPORT_UNICODE 
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+#else
+    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
 #endif       
+#endif  /* SUPPORT_UNICODE */
    }
  }
 else
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
    {
    req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) 
+      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#else
+    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) 
+      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
    }
  }

--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1952,7 +1952,7 @@ is available. */
 #define _pcre2_was_newline           PCRE2_SUFFIX(_pcre2_was_newline_)
 #define _pcre2_xclass                PCRE2_SUFFIX(_pcre2_xclass_)

-extern int          _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int          _pcre2_auto_possessify(PCRE2_UCHAR *,
                      const compile_block *);
 extern int          _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
                      int *, uint32_t, uint32_t, BOOL, compile_block *);
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2015-2019 University of Cambridge
+          New API code Copyright (c) 2015-2020 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -598,12 +598,13 @@ BOOL condition;         /* Used in conditional groups */
 BOOL cur_is_word;       /* Used in "word" tests */
 BOOL prev_is_word;      /* Used in "word" tests */

-/* UTF flag */
+/* UTF and UCP flags */

 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
 #else
-BOOL utf = FALSE;
+BOOL utf = FALSE;  /* Required for convenience even when no Unicode support */
 #endif

 /* This is the length of the last part of a backtracking frame that must be
@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      }
    else
 #endif
+
    /* Not UTF mode */
      {
      if (mb->end_subject - Feptr < 1)
@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
        if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
        }
      }
+
+    /* If UCP is set without UTF we must do the same as above, but with one
+    character per code unit. */
+
+    else if (ucp)
+      {
+      uint32_t cc = UCHAR21(Feptr);
+      fc = Fecode[1];
+      if (fc < 128)
+        {
+        if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
+        }
+      else
+        {
+        if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
+        }
+      Feptr++;
+      Fecode += 2;
+      }
+
    else
 #endif   /* SUPPORT_UNICODE */

-    /* Not UTF mode; use the table for characters < 256. */
+    /* Not UTF or UCP mode; use the table for characters < 256. */
      {
      if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
          != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      SCHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
+
 #ifdef SUPPORT_UNICODE
    if (utf)
      {
@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
        if (ch > 127)
          ch = UCD_OTHERCASE(ch);
        else
-          ch = TABLE_GET(ch, mb->fcc, ch);
+          ch = (mb->fcc)[ch];
        if (ch == fc) RRETURN(MATCH_NOMATCH);
        }
      }
+
+    /* UCP without UTF is as above, but with one character per code unit. */
+
+    else if (ucp)
+      {
+      uint32_t ch;
+      fc = UCHAR21INC(Feptr);
+      ch = Fecode[1];
+      Fecode += 2;
+
+      if (ch == fc)
+        {
+        RRETURN(MATCH_NOMATCH);  /* Caseful match */
+        }
+      else if (Fop == OP_NOTI)   /* If caseless */
+        {
+        if (ch > 127)
+          ch = UCD_OTHERCASE(ch);
+        else
+          ch = (mb->fcc)[ch];
+        if (ch == fc) RRETURN(MATCH_NOMATCH);
+        }
+      }
+
    else
 #endif  /* SUPPORT_UNICODE */
+
+    /* Neither UTF nor UCP is set */
+
      {
      uint32_t ch = Fecode[1];
-      fc = *Feptr++;
+      fc = UCHAR21INC(Feptr);
      if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
        RRETURN(MATCH_NOMATCH);
      Fecode += 2;
@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
 #endif  /* SUPPORT_UNICODE */

    /* When not in UTF mode, load a single-code-unit character. Then proceed as
-    above. */
+    above, using Unicode casing if either UTF or UCP is set. */

    Lc = *Fecode++;

@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    if (Fop >= OP_STARI)
      {
 #if PCRE2_CODE_UNIT_WIDTH == 8
-      /* Lc must be < 128 in UTF-8 mode. */
+#ifdef SUPPORT_UNICODE
+      if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      else
+#endif  /* SUPPORT_UNICODE */
+      /* Lc will be < 128 in UTF-8 mode. */
      Loc = mb->fcc[Lc];
 #else /* 16-bit & 32-bit */
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
      else
 #endif  /* SUPPORT_UNICODE */
      Loc = TABLE_GET(Lc, mb->fcc, Lc);
@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    if (Fop >= OP_NOTSTARI)     /* Caseless */
      {
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127)
+      if ((utf || ucp) && Lc > 127)
        Loc = UCD_OTHERCASE(Lc);
      else
 #endif /* SUPPORT_UNICODE */
@ -6045,7 +6099,6 @@ BOOL firstline;
 BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;
 BOOL startline;
-BOOL utf;

 #if PCRE2_CODE_UNIT_WIDTH == 8
 BOOL memchr_not_found_first_cu = FALSE;
@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
 BOOL use_jit;
 #endif

+/* This flag is needed even when Unicode is not supported for convenience
+(it is used by the IS_NEWLINE macro). */
+
+BOOL utf = FALSE;
+
 #ifdef SUPPORT_UNICODE
+BOOL ucp = FALSE;
 BOOL allow_invalid;
 uint32_t fragment_options = 0;
 #ifdef SUPPORT_JIT
 BOOL jit_checked_utf = FALSE;
 #endif
-#endif
+#endif  /* SUPPORT_UNICODE */

 PCRE2_SIZE frame_size;

@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
          (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
 #endif

-/* Initialize UTF parameters. */
+/* Initialize UTF/UCP parameters. */

-utf = (re->overall_options & PCRE2_UTF) != 0;
 #ifdef SUPPORT_UNICODE
+utf = (re->overall_options & PCRE2_UTF) != 0;
 allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
-#endif
+ucp = (re->overall_options & PCRE2_UCP) != 0;
+#endif  /* SUPPORT_UNICODE */

 /* Convert the partial matching flags into an integer. */

@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
    first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
+#else
+    if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
    }
  }
 else
@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
    {
    req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
+#else
+    if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
    }
  }

@ -6756,15 +6824,16 @@ for(;;)
 #endif
          }

-        /* If we can't find the required code unit, having reached the true end
-        of the subject, break the bumpalong loop, to force a match failure,
-        except when doing partial matching, when we let the next cycle run at
-        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
-        which partially matches "abc", even though the string does not contain
-        the starting character "d". If we have not reached the true end of the
-        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
-        we also let the cycle run, because the matching string is legitimately
-        allowed to start with the first code unit of a newline. */
+        /* If we can't find the required first code unit, having reached the
+        true end of the subject, break the bumpalong loop, to force a match
+        failure, except when doing partial matching, when we let the next cycle
+        run at the end of the subject. To see why, consider the pattern
+        /(?<=abc)def/, which partially matches "abc", even though the string
+        does not contain the starting character "d". If we have not reached the
+        true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
+        temporarily modified) we also let the cycle run, because the matching
+        string is legitimately allowed to start with the first code unit of a
+        newline. */

        if (mb->partial == 0 && start_match >= mb->end_subject)
          {
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -772,15 +772,19 @@ Arguments:
  p             points to the first code unit of the character
  caseless      TRUE if caseless
  utf           TRUE for UTF mode
+  ucp           TRUE for UCP mode 

 Returns:        pointer after the character
 */

 static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, 
+  BOOL ucp)
 {
 uint32_t c = *p++;   /* First code unit */
-(void)utf;           /* Stop compiler warning when UTF not supported */
+
+(void)utf;           /* Stop compiler warnings when UTF not supported */
+(void)ucp;

 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
 0xff. */
@ -810,22 +814,26 @@ if (utf)
 if (caseless)
  {
 #ifdef SUPPORT_UNICODE
+  if (utf || ucp)
+    {
+    c = UCD_OTHERCASE(c);
+#if PCRE2_CODE_UNIT_WIDTH == 8
    if (utf)
      { 
-#if PCRE2_CODE_UNIT_WIDTH == 8
      PCRE2_UCHAR buff[6];
-    c = UCD_OTHERCASE(c);
      (void)PRIV(ord2utf)(c, buff);
      SET_BIT(buff[0]);
+      }
+    else SET_BIT(c);    
 #else  /* 16-bit or 32-bit mode */
-    c = UCD_OTHERCASE(c);
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
 #endif
    }
+ 
  else
 #endif  /* SUPPORT_UNICODE */

-  /* Not UTF */
+  /* Not UTF or UCP */

  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
  }
@ -931,6 +939,7 @@ Arguments:
  re           points to the compiled regex block
  code         points to an expression
  utf          TRUE if in UTF mode
+  ucp          TRUE if in UCP mode 
  depthptr     pointer to recurse depth

 Returns:       SSB_FAIL     => Failed to find any starting code units
@ -941,7 +950,8 @@ Returns:       SSB_FAIL     => Failed to find any starting code units
 */

 static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
+  int *depthptr)
 {
 uint32_t c;
 int yield = SSB_DONE;
@ -1111,7 +1121,7 @@ do
      case OP_SCRIPT_RUN:
      case OP_ASSERT:
      case OP_ASSERT_NA:
-      rc = set_start_bits(re, tcode, utf, depthptr);
+      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
      if (rc == SSB_DONE)
        {
        try_next = FALSE;
@ -1167,7 +1177,7 @@ do
      case OP_BRAZERO:
      case OP_BRAMINZERO:
      case OP_BRAPOSZERO:
-      rc = set_start_bits(re, ++tcode, utf, depthptr);
+      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
      tcode += 1 + LINK_SIZE;
@ -1189,7 +1199,7 @@ do
      case OP_QUERY:
      case OP_MINQUERY:
      case OP_POSQUERY:
-      tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
      break;

      case OP_STARI:
@ -1198,7 +1208,7 @@ do
      case OP_QUERYI:
      case OP_MINQUERYI:
      case OP_POSQUERYI:
-      tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
      break;

      /* Single-char upto sets the bit and tries the next */
@ -1206,13 +1216,13 @@ do
      case OP_UPTO:
      case OP_MINUPTO:
      case OP_POSUPTO:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
      break;

      case OP_UPTOI:
      case OP_MINUPTOI:
      case OP_POSUPTOI:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
      break;

      /* At least one single char sets the bit and stops */
@ -1224,7 +1234,7 @@ do
      case OP_PLUS:
      case OP_MINPLUS:
      case OP_POSPLUS:
-      (void)set_table_bit(re, tcode + 1, FALSE, utf);
+      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
      try_next = FALSE;
      break;

@ -1235,7 +1245,7 @@ do
      case OP_PLUSI:
      case OP_MINPLUSI:
      case OP_POSPLUSI:
-      (void)set_table_bit(re, tcode + 1, TRUE, utf);
+      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
      try_next = FALSE;
      break;

@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
 int count = 0;
 PCRE2_UCHAR *code;
 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;

 /* Find start of compiled code */

@ -1677,7 +1688,7 @@ code units. */
 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
  {
  int depth = 0;
-  int rc = set_start_bits(re, code, utf, &depth);
+  int rc = set_start_bits(re, code, utf, ucp, &depth);
  if (rc == SSB_UNKNOWN) return 1;

  /* If a list of starting code units was set up, scan the list to see if only
@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
          }

        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
-        mode, only values < 128 can be used. */
+        mode, only values < 128 can be used. In all the other cases, c is a 
+        character value. */

 #if PCRE2_CODE_UNIT_WIDTH == 8
-        if (c > 127) goto DONE;
+        if (utf && c > 127) goto DONE;
 #endif
-        if (a < 0) a = c;   /* First one found */
+        if (a < 0) a = c;   /* First one found, save in a */
        else if (b < 0)     /* Second one found */
          {
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
          
 #ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
-          if (utf && UCD_CASESET(c) != 0) goto DONE;   /* Multiple case set */
-#else   /* 16-bit or 32-bit */
+          if (utf || ucp)
+            { 
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
-          if (utf && c > 127) d = UCD_OTHERCASE(c);
-#endif  /* Code width */
+            if (c > 127) d = UCD_OTHERCASE(c);
+            }
 #endif  /* SUPPORT_UNICODE */

-          if (d != a) goto DONE;   /* Not other case of a */
-          b = c;
+          if (d != a) goto DONE;   /* Not the other case of a */
+          b = c;                   /* Save second in b */
          }
        else goto DONE;   /* More than two characters found */
        }
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@ -236,6 +236,7 @@ BOOL use_existing_match;
 BOOL replacement_only;
 #ifdef SUPPORT_UNICODE
 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
 #endif
 PCRE2_UCHAR temp[6];
 PCRE2_SPTR ptr;
@ -758,7 +759,7 @@ do
          if (forcecase != 0)
            {
 #ifdef SUPPORT_UNICODE
-            if (utf)
+            if (utf || ucp)
              {
              uint32_t type = UCD_CHARTYPE(ch);
              if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -860,7 +861,7 @@ do
      if (forcecase != 0)
        {
 #ifdef SUPPORT_UNICODE
-        if (utf)
+        if (utf || ucp)
          {
          uint32_t type = UCD_CHARTYPE(ch);
          if (PRIV(ucp_gentype)[type] == ucp_L &&
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -570,8 +570,10 @@
 /[\xff\x{ffff}]/I,utf

 /[\xff\x{ff}]/I,utf
+    abc\x{ff}def

 /[\xff\x{ff}]/I
+    abc\x{ff}def

 /[Ss]/I

@ -585,4 +587,31 @@
    abc\x80\=startchar
    abc\x80\=startchar,offset=3

+#subject no_jit
+
+/\x{c1}+\x{e1}/iIB,ucp
+    \x{c1}\x{c1}\x{c1}
+    \x{e1}\x{e1}\x{e1} 
+
+/a|\x{c1}/iI,ucp
+    \x{e1}xxx
+
+/a|\x{c1}/iI,utf
+    \x{e1}xxx
+
+/\x{c1}|\x{e1}/iI,ucp
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+
+/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
+    X\x{c1}Y
+
+# Without UTF or UCP characters > 127 have only one case in the default locale.
+
+/X(\x{e1})Y/replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+
+#subject     
+
 # End of testinput10
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -463,4 +463,71 @@

 /(?:\x{ff}|\x{3000})/I,utf

+# ---------------------------------------------------- 
+# UCP and casing tests
+
+/\x{120}/i,I
+
+/\x{c1}/i,I,ucp
+
+/[\x{120}\x{121}]/iB,ucp
+
+/[ab\x{120}]+/iB,ucp
+    aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+
+/[^\x{120}]/i
+    \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+
+/\x{c1}+\x{e1}/iB,ucp
+    \x{c1}\x{c1}\x{c1}
+
+/\x{c1}+\x{e1}/iIB,ucp
+    \x{c1}\x{c1}\x{c1}
+    \x{e1}\x{e1}\x{e1} 
+
+/a|\x{c1}/iI,ucp
+    \x{e1}xxx
+
+/\x{c1}|\x{e1}/iI,ucp
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{121}Y
+
+#subject 
+
+# ---------------------------------------------------- 
+
 # End of testinput12
--- a/testdata/testinput14
+++ b/testdata/testinput14
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

 #subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
    XX\x{d800}
    XX\x{d800}\=offset=3
@ -34,4 +37,45 @@
    \xf7\=ph
    \xf7\x80\=ph
    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+
+/\x{c1}+\x{e1}/iB,ucp
+    \x{c1}\x{c1}\x{c1}
+    \x{e1}\x{e1}\x{e1} 
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+
+/[^\x{120}]/i
+    \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+
+# ---------------------------------------------------- 
+
 # End of testinput14
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -1780,11 +1780,15 @@ Capture group count = 0
 Options: utf
 Starting code units: \xc3 
 Subject length lower bound = 1
+    abc\x{ff}def
+ 0: \x{ff}

 /[\xff\x{ff}]/I
 Capture group count = 0
-Starting code units: \xff 
+First code unit = \xff
 Subject length lower bound = 1
+    abc\x{ff}def
+ 0: \xff

 /[Ss]/I
 Capture group count = 0
@ -1813,4 +1817,62 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3
    abc\x80\=startchar,offset=3
 Error -36 (bad UTF-8 offset)

+#subject no_jit
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \xe1
+
+/a|\x{c1}/iI,utf
+Capture group count = 0
+Options: caseless utf
+Starting code units: A a \xc3 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \x{e1}
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
+    X\x{c1}Y
+ 1: >\xe1<
+
+# Without UTF or UCP characters > 127 have only one case in the default locale.
+
+/X(\x{e1})Y/replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xe1<
+
+#subject     
+
 # End of testinput10
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -1613,7 +1613,7 @@ Subject length lower bound = 1

 /[Ss]/I
 Capture group count = 0
-Starting code units: S s 
+First code unit = 'S' (caseless)
 Subject length lower bound = 1

 /[Ss]/I,utf
@ -1628,4 +1628,134 @@ Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1

+# ---------------------------------------------------- 
+# UCP and casing tests
+
+/\x{120}/i,I
+Capture group count = 0
+Options: caseless
+First code unit = \x{120}
+Subject length lower bound = 1
+
+/\x{c1}/i,I,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/[\x{120}\x{121}]/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{120}
+        Ket
+        End
+------------------------------------------------------------------
+
+/[ab\x{120}]+/iB,ucp
+------------------------------------------------------------------
+        Bra
+        [ABab\x{120}-\x{121}]++
+        Ket
+        End
+------------------------------------------------------------------
+    aABb\x{121}\x{120}
+ 0: aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \xe1
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{121}Y
+ 1: >\x{120}<
+
+#subject 
+
+# ---------------------------------------------------- 
+
 # End of testinput12
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -1611,7 +1611,7 @@ Subject length lower bound = 1

 /[Ss]/I
 Capture group count = 0
-Starting code units: S s 
+First code unit = 'S' (caseless)
 Subject length lower bound = 1

 /[Ss]/I,utf
@ -1626,4 +1626,134 @@ Options: utf
 Starting code units: \xff 
 Subject length lower bound = 1

+# ---------------------------------------------------- 
+# UCP and casing tests
+
+/\x{120}/i,I
+Capture group count = 0
+Options: caseless
+First code unit = \x{120}
+Subject length lower bound = 1
+
+/\x{c1}/i,I,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/[\x{120}\x{121}]/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{120}
+        Ket
+        End
+------------------------------------------------------------------
+
+/[ab\x{120}]+/iB,ucp
+------------------------------------------------------------------
+        Bra
+        [ABab\x{120}-\x{121}]++
+        Ket
+        End
+------------------------------------------------------------------
+    aABb\x{121}\x{120}
+ 0: aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \xe1
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{121}Y
+ 1: >\x{120}<
+
+#subject 
+
+# ---------------------------------------------------- 
+
 # End of testinput12
--- a/testdata/testoutput14-16
+++ b/testdata/testoutput14-16
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

 #subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
    XX\x{d800}
 Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
@ -58,4 +61,65 @@ No match
    \xf7\x80\=ph
 No match
    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14
--- a/testdata/testoutput14-32
+++ b/testdata/testoutput14-32
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

 #subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
    XX\x{d800}
 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
@ -58,4 +61,65 @@ No match
    \xf7\x80\=ph
 No match
    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14
--- a/testdata/testoutput14-8
+++ b/testdata/testoutput14-8
@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

 #subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
    XX\x{d800}
 Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
@ -58,4 +61,65 @@ Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
    \xf7\x80\=ph
 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+    \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+    \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+    \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{121}
+
+/[^\x{120}]/i
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+    \x{121}
+
+/[^\x{120}]/i,ucp
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{121}
+    
+/\x{120}{2}/i,ucp
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+    \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{121}\x{121}
+
+# ---------------------------------------------------- 
+
 # End of testinput14