Runtime UTF checks now take note of the starting offset.

2015-08-18 10:34:05 +00:00 · 2015-08-18 10:34:05 +00:00 · ee41aa906f
commit ee41aa906f
parent 1370a49dfe
30 changed files with 2077 additions and 1664 deletions
--- a/4
+++ b/4
@ -145,6 +145,10 @@ was fixed.
 39. Match limit check added to recursion. This issue was found by Karl Skomski
 with a custom LLVM fuzzer.

+40. Arrange for the UTF check in pcre2_match() and pcre2_dfa_match() to look 
+only at the part of the subject that is relevant when the starting offset is 
+non-zero.
+

 Version 10.20 30-June-2015
 --------------------------
--- a/80
+++ b/80
@ -68,12 +68,13 @@ title10="Test 10: Specials for the 8-bit library with UTF-8 and UCP support"
 title11="Test 11: Specials for the basic 16-bit and 32-bit libraries"
 title12="Test 12: Specials for the 16-bit and 32-bit libraries UTF and UCP support"
 title13="Test 13: DFA specials for the basic 16-bit and 32-bit libraries"
-title14="Test 14: Non-JIT limits and other non-JIT tests"
-title15="Test 15: JIT-specific features when JIT is not available"
-title16="Test 16: JIT-specific features when JIT is available"
-title17="Test 17: Tests of the POSIX interface, excluding UTF/UCP"
-title18="Test 18: Tests of the POSIX interface with UTF/UCP"
-title19="Test 19: Serialization tests"
+title14="Test 14: DFA specials for UTF and UCP support"
+title15="Test 15: Non-JIT limits and other non-JIT tests"
+title16="Test 16: JIT-specific features when JIT is not available"
+title17="Test 17: JIT-specific features when JIT is available"
+title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
+title19="Test 19: Tests of the POSIX interface with UTF/UCP"
+title20="Test 20: Serialization tests"
 maxtest=18

 if [ $# -eq 1 -a "$1" = "list" ]; then
@ -97,6 +98,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title17
  echo $title18
  echo $title19
+  echo $title20
  exit 0
 fi

@ -219,6 +221,7 @@ do16=no
 do17=no
 do18=no
 do19=no
+do20=no

 while [ $# -gt 0 ] ; do
  case $1 in
@ -242,6 +245,7 @@ while [ $# -gt 0 ] ; do
   17) do17=yes;;
   18) do18=yes;;
   19) do19=yes;;
+   20) do20=yes;;
   -8) arg8=yes;;
  -16) arg16=yes;;
  -32) arg32=yes;;
@ -387,7 +391,8 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do4  = no -a $do5  = no -a $do6  = no -a $do7  = no -a \
     $do8  = no -a $do9  = no -a $do10 = no -a $do11 = no -a \
     $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
-     $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no \
+     $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
+     $do20 = no \
   ]; then
  do0=yes
  do1=yes
@ -409,6 +414,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
  do17=yes
  do18=yes
  do19=yes
+  do20=yes 
 fi

 # Handle any explicit skips at this stage, so that an argument list may consist
@ -689,70 +695,78 @@ for bmode in "$test8" "$test16" "$test32"; do
    fi
  fi
  
-  # Test non-JIT match and recursion limits
+  # Tests for DFA UTF and UCP features. Output is different for the different widths.

  if [ $do14 = yes ] ; then
    echo $title14
-    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput14 testtry
-    checkresult $? 14 ""
+    $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
+    checkresult $? 14-$bits "$opt"
+  fi
+
+  # Test non-JIT match and recursion limits
+
+  if [ $do15 = yes ] ; then
+    echo $title15
+    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
+    checkresult $? 15 ""
  fi

  # Test JIT-specific features when JIT is not available

-  if [ $do15 = yes ] ; then
-    echo $title15
-    if [ $jit -ne 0 ] ; then
-      echo "  Skipped because JIT is available"
-    else
-      $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry
-      checkresult $? 15 ""
-    fi
-  fi
-
-  # Test JIT-specific features when JIT is available
-
  if [ $do16 = yes ] ; then
    echo $title16
-    if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
-      echo "  Skipped because JIT is not available or nojit was specified"
+    if [ $jit -ne 0 ] ; then
+      echo "  Skipped because JIT is available"
    else
      $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput16 testtry
      checkresult $? 16 ""
    fi
  fi

-  # Tests for the POSIX interface without UTF/UCP (8-bit only)
+  # Test JIT-specific features when JIT is available

  if [ $do17 = yes ] ; then
    echo $title17
-    if [ "$bits" = "16" -o "$bits" = "32" ] ; then
-      echo "  Skipped when running 16/32-bit tests"
+    if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then
+      echo "  Skipped because JIT is not available or nojit was specified"
    else
      $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput17 testtry
      checkresult $? 17 ""
    fi
  fi

-  # Tests for the POSIX interface with UTF/UCP (8-bit only)
+  # Tests for the POSIX interface without UTF/UCP (8-bit only)

  if [ $do18 = yes ] ; then
    echo $title18
    if [ "$bits" = "16" -o "$bits" = "32" ] ; then
      echo "  Skipped when running 16/32-bit tests"
-    elif [ $utf -eq 0 ] ; then
-      echo "  Skipped because UTF-$bits support is not available"
    else
      $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput18 testtry
      checkresult $? 18 ""
    fi
  fi

-  # Serialization tests
+  # Tests for the POSIX interface with UTF/UCP (8-bit only)

  if [ $do19 = yes ] ; then
    echo $title19
-    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
-    checkresult $? 19 ""
+    if [ "$bits" = "16" -o "$bits" = "32" ] ; then
+      echo "  Skipped when running 16/32-bit tests"
+    elif [ $utf -eq 0 ] ; then
+      echo "  Skipped because UTF-$bits support is not available"
+    else
+      $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry
+      checkresult $? 19 ""
+    fi
+  fi
+
+  # Serialization tests
+
+  if [ $do20 = yes ] ; then
+    echo $title20
+    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
+    checkresult $? 20 ""
  fi

 # End of loop for 8/16/32-bit tests
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@ -1,4 +1,4 @@
-.TH PCRE2API 3 "29 July 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "18 August 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@ -2022,12 +2022,19 @@ If the pattern is anchored, such a match can occur only if the pattern contains
 .sp
 When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
 string is checked by default when \fBpcre2_match()\fP is subsequently called.
-The entire string is checked before any other processing takes place, and a
+If a non-zero starting offset is given, the check is applied only to that part
+of the subject that could be inspected during matching, and there is a check
+that the starting offset points to the first code unit of a character or to the
+end of the subject. If there are no lookbehind assertions in the pattern, the
+check starts at the starting offset. Otherwise, it starts at the length of the
+longest lookbehind before the starting offset, or at the start of the subject
+if there are not that many characters before the starting offset. Note that the
+sequences \eb and \eB are one-character lookbehinds.
+.P
+The check is carried out before any other processing takes place, and a
 negative error code is returned if the check fails. There are several UTF error
 codes for each code unit width, corresponding to different problems with the
-code unit sequence. The value of \fIstartoffset\fP is also checked, to ensure
-that it points to the start of a character or to the end of the subject. There
-are discussions about the validity of
+code unit sequence. There are discussions about the validity of
 .\" HTML <a href="pcre2unicode.html#utf8strings">
 .\" </a>
 UTF-8 strings,
@ -2939,6 +2946,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 29 July 2015
+Last updated: 18 August 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/doc/pcre2unicode.3
+++ b/doc/pcre2unicode.3
@ -1,4 +1,4 @@
-.TH PCRE2UNICODE 3 "23 November 2014" "PCRE2 10.00"
+.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
 .SH NAME
 PCRE - Perl-compatible regular expressions (revised API)
 .SH "UNICODE AND UTF SUPPORT"
@ -117,11 +117,21 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code knows
 as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
 strings to be in host byte order.
 .P
-The entire string is checked before any other processing takes place. In
-addition to checking the format of the string, there is a check to ensure that
-all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
-The so-called "non-character" code points are not excluded because Unicode
-corrigendum #9 makes it clear that they should not be.
+A UTF string is checked before any other processing takes place. In the case of 
+\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting 
+offset, the check is applied only to that part of the subject that could be
+inspected during matching, and there is a check that the starting offset points
+to the first code unit of a character or to the end of the subject. If there
+are no lookbehind assertions in the pattern, the check starts at the starting
+offset. Otherwise, it starts at the length of the longest lookbehind before the
+starting offset, or at the start of the subject if there are not that many
+characters before the starting offset. Note that the sequences \eb and \eB are
+one-character lookbehinds.
+.P
+In addition to checking the format of the string, there is a check to ensure
+that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
+area. The so-called "non-character" code points are not excluded because
+Unicode corrigendum #9 makes it clear that they should not be.
 .P
 Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
 where they are used in pairs to encode code points with values greater than
@ -252,6 +262,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 23 November 2014
-Copyright (c) 1997-2014 University of Cambridge.
+Last updated: 18 August 2015
+Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -4682,7 +4682,7 @@ for (;; ptr++)
      that it's a length rather than a small character. */

 #ifdef MAYBE_UTF_MULTI
-      if (utf && NOT_FIRSTCHAR(code[-1]))
+      if (utf && NOT_FIRSTCU(code[-1]))
        {
        PCRE2_UCHAR *lastchar = code - 1;
        BACKCHAR(lastchar);
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -2774,7 +2774,7 @@ for (;;)
              {
              PCRE2_SPTR p = start_subject + local_offsets[rc];
              PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
-              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
              }
 #endif
            if (charcount > 0)
@ -2874,7 +2874,7 @@ for (;;)
            PCRE2_SPTR pp = local_ptr;
            charcount = (int)(pp - p);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
-            if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
 #endif
            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
            }
@ -2960,7 +2960,7 @@ for (;;)
              {
              PCRE2_SPTR p = start_subject + local_offsets[0];
              PCRE2_SPTR pp = start_subject + local_offsets[1];
-              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
              }
 #endif
            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
@ -3264,18 +3264,50 @@ switch(re->newline_convention)

 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
 we must also check that a starting offset does not point into the middle of a
-multiunit character. */
+multiunit character. We check only the portion of the subject that is going to 
+be inspected during matching - from the offset minus the maximum lookbehind
+to the given length. This saves time when a small part of a large subject is 
+being matched by the use of a starting offset. Note that the maximum lookbehind 
+is a number of characters, not code units. */

 #ifdef SUPPORT_UNICODE
 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
  {
-  match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
-  if (match_data->rc != 0) return match_data->rc;
+  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
+
+  if (start_offset > 0)
+    { 
 #if PCRE2_CODE_UNIT_WIDTH != 32
-  if (start_offset > 0 && start_offset < length &&
-      NOT_FIRSTCHAR(subject[start_offset]))
-    return PCRE2_ERROR_BADUTFOFFSET;
+    unsigned int i; 
+    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
+      return PCRE2_ERROR_BADUTFOFFSET;
+    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
+      {
+      check_subject--;
+      while (check_subject > subject &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      (*check_subject & 0xc0) == 0x80)
+#else  /* 16-bit */
+      (*check_subject & 0xfc00) == 0xdc00)
+#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
+        check_subject--; 
+      }  
+#else   /* In the 32-bit library, one code unit equals one character. */
+    check_subject -= re->max_lookbehind;
+    if (check_subject < subject) check_subject = subject; 
 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
+    }
+  
+  /* Validate the relevant portion of the subject. After an error, adjust the
+  offset to be an absolute offset in the whole string. */
+    
+  match_data->rc = PRIV(valid_utf)(check_subject, 
+    length - (check_subject - subject), &(match_data->startchar));
+  if (match_data->rc != 0) 
+    {
+    match_data->startchar += check_subject - subject;
+    return match_data->rc;
+    } 
  }
 #endif  /* SUPPORT_UNICODE */

--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@ -72,7 +72,7 @@ just to undefine them all. */
 #undef MAX_MARK
 #undef MAX_PATTERN_SIZE
 #undef MAX_UTF_SINGLE_CU
-#undef NOT_FIRSTCHAR
+#undef NOT_FIRSTCU
 #undef PUT
 #undef PUT2
 #undef PUT2INC
@ -252,7 +252,7 @@ UTF support is omitted, we don't even define them. */
 /* #define MAX_UTF_SINGLE_CU */
 /* #define HAS_EXTRALEN(c) */
 /* #define GET_EXTRALEN(c) */
-/* #define NOT_FIRSTCHAR(c) */
+/* #define NOT_FIRSTCU(c) */
 #define GETCHAR(c, eptr) c = *eptr;
 #define GETCHARTEST(c, eptr) c = *eptr;
 #define GETCHARINC(c, eptr) c = *eptr++;
@ -285,10 +285,10 @@ Otherwise it has an undefined behaviour. */

 #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])

-/* Returns TRUE, if the given character is not the first character
-of a UTF sequence. */
+/* Returns TRUE, if the given value is not the first code unit of a UTF
+sequence. */

-#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
+#define NOT_FIRSTCU(c) (((c) & 0xc0) == 0x80)

 /* Get the next UTF-8 character, not advancing the pointer. This is called when
 we know we are in UTF-8 mode. */
@ -371,10 +371,10 @@ Otherwise it has an undefined behaviour. */

 #define GET_EXTRALEN(c) 1

-/* Returns TRUE, if the given character is not the first character
-of a UTF sequence. */
+/* Returns TRUE, if the given value is not the first code unit of a UTF
+sequence. */

-#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00)
+#define NOT_FIRSTCU(c) (((c) & 0xfc00) == 0xdc00)

 /* Base macro to pick up the low surrogate of a UTF-16 character, not
 advancing the pointer. */
@ -469,7 +469,7 @@ into one PCRE2_UCHAR unit. */
 #define MAX_UTF_SINGLE_CU (0x10ffffu)
 #define HAS_EXTRALEN(c) (0)
 #define GET_EXTRALEN(c) (0)
-#define NOT_FIRSTCHAR(c) (0)
+#define NOT_FIRSTCU(c) (0)

 /* Get the next UTF-32 character, not advancing the pointer. This is called when
 we know we are in UTF-32 mode. */
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -6485,6 +6485,7 @@ mb->match_frames_base = &frame_zero;
 subject string. */

 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
+end_subject = subject + length;

 /* Plausibility checks */

@ -6536,18 +6537,50 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :

 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
 we must also check that a starting offset does not point into the middle of a
-multiunit character. */
+multiunit character. We check only the portion of the subject that is going to 
+be inspected during matching - from the offset minus the maximum lookbehind
+to the given length. This saves time when a small part of a large subject is 
+being matched by the use of a starting offset. Note that the maximum lookbehind 
+is a number of characters, not code units. */

 #ifdef SUPPORT_UNICODE
 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
  {
-  match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
-  if (match_data->rc != 0) return match_data->rc;
+  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
+
+  if (start_offset > 0)
+    { 
 #if PCRE2_CODE_UNIT_WIDTH != 32
-  if (start_offset > 0 && start_offset < length &&
-      NOT_FIRSTCHAR(subject[start_offset]))
-    return PCRE2_ERROR_BADUTFOFFSET;
+    unsigned int i; 
+    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
+      return PCRE2_ERROR_BADUTFOFFSET;
+    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
+      {
+      check_subject--;
+      while (check_subject > subject &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      (*check_subject & 0xc0) == 0x80)
+#else  /* 16-bit */
+      (*check_subject & 0xfc00) == 0xdc00)
+#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
+        check_subject--; 
+      }  
+#else   /* In the 32-bit library, one code unit equals one character. */
+    check_subject -= re->max_lookbehind;
+    if (check_subject < subject) check_subject = subject; 
 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
+    }
+  
+  /* Validate the relevant portion of the subject. After an error, adjust the
+  offset to be an absolute offset in the whole string. */
+    
+  match_data->rc = PRIV(valid_utf)(check_subject, 
+    length - (check_subject - subject), &(match_data->startchar));
+  if (match_data->rc != 0) 
+    {
+    match_data->startchar += check_subject - subject;
+    return match_data->rc;
+    } 
  }
 #endif  /* SUPPORT_UNICODE */

@ -6594,7 +6627,7 @@ else

 mb->start_subject = subject;
 mb->start_offset = start_offset;
-mb->end_subject = end_subject = mb->start_subject + length;
+mb->end_subject = end_subject;
 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;

 mb->moptions = options;                 /* Match options */
--- a/testdata/testinput10
+++ b/testdata/testinput10
@ -133,6 +133,35 @@
    \xfc\x84\x80\x80\x80\x80\=no_utf_check
    \xfd\x83\x80\x80\x80\x80\=no_utf_check
    
+# Similar tests with offsets
+
+/badutf/utf
+    X\xdfabcd
+    X\xdfabcd\=offset=1
+    X\xdfabcd\=offset=2
+
+/(?<=x)badutf/utf
+    X\xdfabcd
+    X\xdfabcd\=offset=1
+    X\xdfabcd\=offset=2
+    X\xdfabcd\=offset=3
+    X\xdfabcd\xdf\=offset=3
+
+/(?<=xx)badutf/utf
+    X\xdfabcd
+    X\xdfabcd\=offset=1
+    X\xdfabcd\=offset=2
+    X\xdfabcd\=offset=3
+
+/(?<=xxxx)badutf/utf
+    X\xdfabcd
+    X\xdfabcd\=offset=1
+    X\xdfabcd\=offset=2
+    X\xdfabcd\=offset=3
+    X\xdfabcd\=offset=6
+    X\xdfabc\xdf\=offset=6
+    X\xdfabc\xdf\=offset=7
+ 
 /\x{100}/IB,utf

 /\x{1000}/IB,utf
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -158,6 +158,7 @@

 /X/utf
    XX\x{d800}
+    XX\x{d800}\=offset=3
    XX\x{d800}\=no_utf_check
    XX\x{da00}
    XX\x{da00}\=no_utf_check
@ -170,6 +171,9 @@
    XX\x{110000}
    XX\x{d800}\x{1234}
    
+/(?<=.)X/utf
+    XX\x{d800}\=offset=3
+
 /(*UTF16)\x{11234}/
  abcd\x{11234}pqr

--- a/testdata/testinput14
+++ b/testdata/testinput14
@ -1,155 +1,37 @@
-# These are:
-#
-# (1) Tests of the match-limiting features. The results are different for
-# interpretive or JIT matching, so this test should not be run with JIT. The
-# same tests are run using JIT in test 16.
+# These test special (mostly error) UTF features of DFA matching. They are a 
+# selection of the more comprehensive tests that are run for non-DFA matching.
+# The output is different for the different widths.

-# (2) Other tests that must not be run with JIT.
+#subject dfa

-/(a+)*zz/I
-  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
-  aaaaaaaaaaaaaz\=find_limits
+/X/utf
+    XX\x{d800}
+    XX\x{d800}\=offset=3
+    XX\x{d800}\=no_utf_check
+    XX\x{da00}
+    XX\x{da00}\=no_utf_check
+    XX\x{dc00}
+    XX\x{dc00}\=no_utf_check
+    XX\x{de00}
+    XX\x{de00}\=no_utf_check
+    XX\x{dfff}
+    XX\x{dfff}\=no_utf_check
+    XX\x{110000}
+    XX\x{d800}\x{1234}
          
-!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
-   /* this is a C style comment */\=find_limits
+/badutf/utf
+    X\xdf
+    XX\xef
+    XXX\xef\x80
+    X\xf7
+    XX\xf7\x80
+    XXX\xf7\x80\x80

-/^(?>a)++/
-    aa\=find_limits
-    aaaaaaaaa\=find_limits
-    
-/(a)(?1)++/
-    aa\=find_limits
-    aaaaaaaaa\=find_limits
-
-/a(?:.)*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-    
-/a(?:.(*THEN))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/a(?:.(*THEN:ABC))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
-     aabbccddee\=find_limits
-
-/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
-     aabbccddee\=find_limits
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
-     aabbccddee\=find_limits
-
-/(*LIMIT_MATCH=12bc)abc/
-
-/(*LIMIT_MATCH=4294967290)abc/
-
-/(*LIMIT_RECURSION=4294967280)abc/I
-
-/(a+)*zz/
-    aaaaaaaaaaaaaz
-    aaaaaaaaaaaaaz\=match_limit=3000
-
-/(a+)*zz/
-    aaaaaaaaaaaaaz\=recursion_limit=10
-
-/(*LIMIT_MATCH=3000)(a+)*zz/I
-    aaaaaaaaaaaaaz
-    aaaaaaaaaaaaaz\=match_limit=60000
-
-/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
-    aaaaaaaaaaaaaz
-
-/(*LIMIT_MATCH=60000)(a+)*zz/I
-    aaaaaaaaaaaaaz
-    aaaaaaaaaaaaaz\=match_limit=3000
-
-/(*LIMIT_RECURSION=10)(a+)*zz/I
-    aaaaaaaaaaaaaz
-    aaaaaaaaaaaaaz\=recursion_limit=1000
-
-/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
-    aaaaaaaaaaaaaz
-
-/(*LIMIT_RECURSION=1000)(a+)*zz/I
-    aaaaaaaaaaaaaz
-    aaaaaaaaaaaaaz\=recursion_limit=10
-    
-# These three have infinitely nested recursions. 
-    
-/((?2))((?1))/
-    abc
-
-/((?(R2)a+|(?1)b))/
-    aaaabcde
-
-/(?(R)a*(?1)|((?R))b)/
-    aaaabcde
-    
-# The allusedtext modifier does not work with JIT, which does not maintain
-# the leftchar/rightchar data.
-
-/abc(?=xyz)/allusedtext
-    abcxyzpqr
-    abcxyzpqr\=aftertext
-    
-/(?<=pqr)abc(?=xyz)/allusedtext
-    xyzpqrabcxyzpqr
-    xyzpqrabcxyzpqr\=aftertext
-    
-/a\b/
-    a.\=allusedtext
-    a\=allusedtext  
-
-/abc\Kxyz/
-    abcxyz\=allusedtext
-
-/abc(?=xyz(*ACCEPT))/
-    abcxyz\=allusedtext
-
-/abc(?=abcde)(?=ab)/allusedtext
-    abcabcdefg
-    
-# These tests provoke recursion loops, which give a different error message
-# when JIT is used.
-
-/(?R)/I
-    abcd
-
-/(a|(?R))/I
-    abcd
-    defg 
-
-/(ab|(bc|(de|(?R))))/I
-    abcd
-    fghi 
-
-/(ab|(bc|(de|(?1))))/I
-    abcd
-    fghi 
-
-/x(ab|(bc|(de|(?1)x)x)x)/I
-    xab123
-    xfghi 
-
-/(?!\w)(?R)/
-    abcd
-    =abc 
-
-/(?=\w)(?R)/
-    =abc 
-    abcd
-
-/(?<!\w)(?R)/
-    abcd
-
-/(?<=\w)(?R)/
-    abcd
-
-/(a+|(?R)b)/
-    aaa
-    bbb 
-
-/[^\xff]((?1))/BI
-    abcd
+/shortutf/utf
+    XX\xdf\=ph
+    XX\xef\=ph
+    XX\xef\x80\=ph
+    \xf7\=ph
+    \xf7\x80\=ph

 # End of testinput14
--- a/testdata/testinput15
+++ b/testdata/testinput15
@ -1,9 +1,155 @@
-# This test is run only when JIT support is not available. It checks that an
-# attempt to use it has the expected behaviour. It also tests things that
-# are different without JIT.
+# These are:
+#
+# (1) Tests of the match-limiting features. The results are different for
+# interpretive or JIT matching, so this test should not be run with JIT. The
+# same tests are run using JIT in test 17.

-/abc/I,jit,jitverify
+# (2) Other tests that must not be run with JIT.

-/a*/I
+/(a+)*zz/I
+  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
+  aaaaaaaaaaaaaz\=find_limits
+
+!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
+   /* this is a C style comment */\=find_limits
+
+/^(?>a)++/
+    aa\=find_limits
+    aaaaaaaaa\=find_limits
+    
+/(a)(?1)++/
+    aa\=find_limits
+    aaaaaaaaa\=find_limits
+
+/a(?:.)*?a/ims
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+    
+/a(?:.(*THEN))*?a/ims
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+
+/a(?:.(*THEN:ABC))*?a/ims
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+
+/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
+     aabbccddee\=find_limits
+
+/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
+     aabbccddee\=find_limits
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
+     aabbccddee\=find_limits
+
+/(*LIMIT_MATCH=12bc)abc/
+
+/(*LIMIT_MATCH=4294967290)abc/
+
+/(*LIMIT_RECURSION=4294967280)abc/I
+
+/(a+)*zz/
+    aaaaaaaaaaaaaz
+    aaaaaaaaaaaaaz\=match_limit=3000
+
+/(a+)*zz/
+    aaaaaaaaaaaaaz\=recursion_limit=10
+
+/(*LIMIT_MATCH=3000)(a+)*zz/I
+    aaaaaaaaaaaaaz
+    aaaaaaaaaaaaaz\=match_limit=60000
+
+/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
+    aaaaaaaaaaaaaz
+
+/(*LIMIT_MATCH=60000)(a+)*zz/I
+    aaaaaaaaaaaaaz
+    aaaaaaaaaaaaaz\=match_limit=3000
+
+/(*LIMIT_RECURSION=10)(a+)*zz/I
+    aaaaaaaaaaaaaz
+    aaaaaaaaaaaaaz\=recursion_limit=1000
+
+/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
+    aaaaaaaaaaaaaz
+
+/(*LIMIT_RECURSION=1000)(a+)*zz/I
+    aaaaaaaaaaaaaz
+    aaaaaaaaaaaaaz\=recursion_limit=10
+    
+# These three have infinitely nested recursions. 
+    
+/((?2))((?1))/
+    abc
+
+/((?(R2)a+|(?1)b))/
+    aaaabcde
+
+/(?(R)a*(?1)|((?R))b)/
+    aaaabcde
+    
+# The allusedtext modifier does not work with JIT, which does not maintain
+# the leftchar/rightchar data.
+
+/abc(?=xyz)/allusedtext
+    abcxyzpqr
+    abcxyzpqr\=aftertext
+    
+/(?<=pqr)abc(?=xyz)/allusedtext
+    xyzpqrabcxyzpqr
+    xyzpqrabcxyzpqr\=aftertext
+    
+/a\b/
+    a.\=allusedtext
+    a\=allusedtext  
+
+/abc\Kxyz/
+    abcxyz\=allusedtext
+
+/abc(?=xyz(*ACCEPT))/
+    abcxyz\=allusedtext
+
+/abc(?=abcde)(?=ab)/allusedtext
+    abcabcdefg
+    
+# These tests provoke recursion loops, which give a different error message
+# when JIT is used.
+
+/(?R)/I
+    abcd
+
+/(a|(?R))/I
+    abcd
+    defg 
+
+/(ab|(bc|(de|(?R))))/I
+    abcd
+    fghi 
+
+/(ab|(bc|(de|(?1))))/I
+    abcd
+    fghi 
+
+/x(ab|(bc|(de|(?1)x)x)x)/I
+    xab123
+    xfghi 
+
+/(?!\w)(?R)/
+    abcd
+    =abc 
+
+/(?=\w)(?R)/
+    =abc 
+    abcd
+
+/(?<!\w)(?R)/
+    abcd
+
+/(?<=\w)(?R)/
+    abcd
+
+/(a+|(?R)b)/
+    aaa
+    bbb 
+
+/[^\xff]((?1))/BI
+    abcd

 # End of testinput15
--- a/testdata/testinput16
+++ b/testdata/testinput16
--- a/testdata/testinput17
+++ b/testdata/testinput17
--- a/testdata/testinput18
+++ b/testdata/testinput18
@ -1,17 +1,95 @@
 # This set of tests is run only with the 8-bit library. It tests the POSIX
-# interface with UTF/UCP support, which is supported only with the 8-bit
-# library. This test should not be run with JIT (which is not available for the
-# POSIX interface).
+# interface, which is supported only with the 8-bit library. This test should
+# not be run with JIT (which is not available for the POSIX interface).
    
+#forbid_utf
 #pattern posix

-/a\x{1234}b/utf
-    a\x{1234}b
+# Test invalid options

-/\w/
-    +++\x{c2}
+/abc/auto_callout

-/\w/ucp
-    +++\x{c2}
+/abc/
+   abc\=find_limits

-# End of testdata/testinput17
+/abc/
+  abc\=partial_hard
+
+# Real tests
+
+/abc/
+    abc
+    *** Failers
+
+/^abc|def/
+    abcdef
+    abcdef\=notbol
+
+/.*((abc)$|(def))/
+    defabc
+    defabc\=noteol
+
+/the quick brown fox/
+    the quick brown fox
+    *** Failers
+    The Quick Brown Fox
+
+/the quick brown fox/i
+    the quick brown fox
+    The Quick Brown Fox
+
+/abc.def/
+    *** Failers
+    abc\ndef
+
+/abc$/
+    abc
+    abc\n
+
+/(abc)\2/
+
+/(abc\1)/
+    abc
+
+/a*(b+)(z)(z)/
+    aaaabbbbzzzz
+    aaaabbbbzzzz\=ovector=0
+    aaaabbbbzzzz\=ovector=1
+    aaaabbbbzzzz\=ovector=2
+
+/ab.cd/
+    ab-cd
+    ab=cd
+    ** Failers
+    ab\ncd
+
+/ab.cd/s
+    ab-cd
+    ab=cd
+    ab\ncd
+
+/a(b)c/no_auto_capture
+    abc
+
+/a(?P<name>b)c/no_auto_capture
+    abc
+
+/a?|b?/
+    abc
+    ** Failers
+    ddd\=notempty
+
+/\w+A/
+   CDAAAAB
+
+/\w+A/ungreedy
+   CDAAAAB
+   
+/\Biss\B/I,aftertext
+    Mississippi
+
+/abc/\
+
+"(?(?C)"
+
+# End of testdata/testinput18
--- a/testdata/testinput19
+++ b/testdata/testinput19
@ -1,62 +1,17 @@
-# This set of tests exercises the serialization/deserialization functions in
-# the library. It does not use UTF or JIT.
+# This set of tests is run only with the 8-bit library. It tests the POSIX
+# interface with UTF/UCP support, which is supported only with the 8-bit
+# library. This test should not be run with JIT (which is not available for the
+# POSIX interface).
    
-#forbid_utf
+#pattern posix

-# Compile several patterns, push them onto the stack, and then write them
-# all to a file.
+/a\x{1234}b/utf
+    a\x{1234}b

-#pattern push
+/\w/
+    +++\x{c2}

-/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
-  (?(DEFINE)
-  (?<NAME_PAT>[a-z]+)
-  (?<ADDRESS_PAT>\d+)
-  )/x
-/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
+/\w/ucp
+    +++\x{c2}
    
-#save testsaved1
-
-# Do it again for some more patterns.
-
-/(*MARK:A)(*SKIP:B)(C|X)/mark
-/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
-
-#save testsaved2
-#pattern -push
-
-# Reload the patterns, then pop them one by one and check them.
-
-#load testsaved1
-#load testsaved2
-
-#pop info
-    foofoo             
-    barbar
-    
-#pop mark
-    C
-    D 
-    
-#pop
-    AmanaplanacanalPanama   
-
-#pop info
-    metcalfe 33
-    
-# Check for an error when different tables are used.
-
-/abc/push,tables=1
-/xyz/push,tables=2
-#save testsaved1
-
-#pop
-    xyz
-
-#pop
-    abc
-
-#pop should give an error
-    pqr
-
-# End of testinput19 
+# End of testdata/testinput19
--- a/testdata/testinput20
+++ b/testdata/testinput20
@ -0,0 +1,62 @@
+# This set of tests exercises the serialization/deserialization functions in
+# the library. It does not use UTF or JIT.
+
+#forbid_utf
+
+# Compile several patterns, push them onto the stack, and then write them
+# all to a file.
+
+#pattern push
+
+/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
+  (?(DEFINE)
+  (?<NAME_PAT>[a-z]+)
+  (?<ADDRESS_PAT>\d+)
+  )/x
+/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
+
+#save testsaved1
+
+# Do it again for some more patterns.
+
+/(*MARK:A)(*SKIP:B)(C|X)/mark
+/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
+
+#save testsaved2
+#pattern -push
+
+# Reload the patterns, then pop them one by one and check them.
+
+#load testsaved1
+#load testsaved2
+
+#pop info
+    foofoo             
+    barbar
+    
+#pop mark
+    C
+    D 
+    
+#pop
+    AmanaplanacanalPanama   
+
+#pop info
+    metcalfe 33
+    
+# Check for an error when different tables are used.
+
+/abc/push,tables=1
+/xyz/push,tables=2
+#save testsaved1
+
+#pop
+    xyz
+
+#pop
+    abc
+
+#pop should give an error
+    pqr
+
+# End of testinput20
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@ -236,6 +236,54 @@ No match
    \xfd\x83\x80\x80\x80\x80\=no_utf_check
 No match
    
+# Similar tests with offsets
+
+/badutf/utf
+    X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=2
+No match
+
+/(?<=x)badutf/utf
+    X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=2
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=3
+No match
+    X\xdfabcd\xdf\=offset=3
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6
+
+/(?<=xx)badutf/utf
+    X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=2
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=3
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+
+/(?<=xxxx)badutf/utf
+    X\xdfabcd
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=1
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=2
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=3
+Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
+    X\xdfabcd\=offset=6
+No match
+    X\xdfabc\xdf\=offset=6
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5
+    X\xdfabc\xdf\=offset=7
+Failed: error -33: bad offset value
+ 
 /\x{100}/IB,utf
 ------------------------------------------------------------------
        Bra
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -609,6 +609,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
 /X/utf
    XX\x{d800}
 Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+    XX\x{d800}\=offset=3
+No match
    XX\x{d800}\=no_utf_check
 0: X
    XX\x{da00}
@ -632,6 +634,10 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
    XX\x{d800}\x{1234}
 Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
    
+/(?<=.)X/utf
+    XX\x{d800}\=offset=3
+Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+
 /(*UTF16)\x{11234}/
  abcd\x{11234}pqr
 0: \x{11234}
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -602,6 +602,8 @@ Failed: error 106 at offset 13: missing terminating ] for character class
 /X/utf
    XX\x{d800}
 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{d800}\=offset=3
+No match
    XX\x{d800}\=no_utf_check
 0: X
    XX\x{da00}
@ -625,6 +627,10 @@ Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defin
    XX\x{d800}\x{1234}
 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
    
+/(?<=.)X/utf
+    XX\x{d800}\=offset=3
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+
 /(*UTF16)\x{11234}/
 Failed: error 160 at offset 5: (*VERB) not recognized or malformed
  abcd\x{11234}pqr
--- a/testdata/testoutput14
+++ b/testdata/testoutput14
@ -1,334 +0,0 @@
-# These are:
-#
-# (1) Tests of the match-limiting features. The results are different for
-# interpretive or JIT matching, so this test should not be run with JIT. The
-# same tests are run using JIT in test 16.
-
-# (2) Other tests that must not be run with JIT.
-
-/(a+)*zz/I
-Capturing subpattern count = 1
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
-Minimum match limit = 8
-Minimum recursion limit = 6
- 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
- 1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-  aaaaaaaaaaaaaz\=find_limits
-Minimum match limit = 32768
-Minimum recursion limit = 29
-No match
-
-!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
-Capturing subpattern count = 1
-May match empty string
-Subject length lower bound = 0
-   /* this is a C style comment */\=find_limits
-Minimum match limit = 120
-Minimum recursion limit = 6
- 0: /* this is a C style comment */
- 1: /* this is a C style comment */
-
-/^(?>a)++/
-    aa\=find_limits
-Minimum match limit = 5
-Minimum recursion limit = 2
- 0: aa
-    aaaaaaaaa\=find_limits
-Minimum match limit = 12
-Minimum recursion limit = 2
- 0: aaaaaaaaa
-    
-/(a)(?1)++/
-    aa\=find_limits
-Minimum match limit = 7
-Minimum recursion limit = 4
- 0: aa
- 1: a
-    aaaaaaaaa\=find_limits
-Minimum match limit = 21
-Minimum recursion limit = 4
- 0: aaaaaaaaa
- 1: a
-
-/a(?:.)*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 65
-Minimum recursion limit = 2
- 0: abbbbbbbbbbbbbbbbbbbbba
-    
-/a(?:.(*THEN))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 86
-Minimum recursion limit = 45
- 0: abbbbbbbbbbbbbbbbbbbbba
-
-/a(?:.(*THEN:ABC))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum match limit = 86
-Minimum recursion limit = 45
- 0: abbbbbbbbbbbbbbbbbbbbba
-
-/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
-     aabbccddee\=find_limits
-Minimum match limit = 7
-Minimum recursion limit = 2
- 0: aabbccddee
-
-/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
-     aabbccddee\=find_limits
-Minimum match limit = 17
-Minimum recursion limit = 16
- 0: aabbccddee
- 1: aa
- 2: bb
- 3: cc
- 4: dd
- 5: ee
-
-/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
-     aabbccddee\=find_limits
-Minimum match limit = 13
-Minimum recursion limit = 10
- 0: aabbccddee
- 1: aa
- 2: cc
- 3: ee
-
-/(*LIMIT_MATCH=12bc)abc/
-Failed: error 160 at offset 17: (*VERB) not recognized or malformed
-
-/(*LIMIT_MATCH=4294967290)abc/
-Failed: error 160 at offset 24: (*VERB) not recognized or malformed
-
-/(*LIMIT_RECURSION=4294967280)abc/I
-Capturing subpattern count = 0
-Recursion limit = 4294967280
-First code unit = 'a'
-Last code unit = 'c'
-Subject length lower bound = 3
-
-/(a+)*zz/
-    aaaaaaaaaaaaaz
-No match
-    aaaaaaaaaaaaaz\=match_limit=3000
-Failed: error -47: match limit exceeded
-
-/(a+)*zz/
-    aaaaaaaaaaaaaz\=recursion_limit=10
-Failed: error -53: recursion limit exceeded
-
-/(*LIMIT_MATCH=3000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 3000
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-    aaaaaaaaaaaaaz
-Failed: error -47: match limit exceeded
-    aaaaaaaaaaaaaz\=match_limit=60000
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 3000
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-    aaaaaaaaaaaaaz
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_MATCH=60000)(a+)*zz/I
-Capturing subpattern count = 1
-Match limit = 60000
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-    aaaaaaaaaaaaaz
-No match
-    aaaaaaaaaaaaaz\=match_limit=3000
-Failed: error -47: match limit exceeded
-
-/(*LIMIT_RECURSION=10)(a+)*zz/I
-Capturing subpattern count = 1
-Recursion limit = 10
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-    aaaaaaaaaaaaaz
-Failed: error -53: recursion limit exceeded
-    aaaaaaaaaaaaaz\=recursion_limit=1000
-Failed: error -53: recursion limit exceeded
-
-/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
-Capturing subpattern count = 1
-Recursion limit = 1000
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-    aaaaaaaaaaaaaz
-No match
-
-/(*LIMIT_RECURSION=1000)(a+)*zz/I
-Capturing subpattern count = 1
-Recursion limit = 1000
-Starting code units: a z 
-Last code unit = 'z'
-Subject length lower bound = 2
-    aaaaaaaaaaaaaz
-No match
-    aaaaaaaaaaaaaz\=recursion_limit=10
-Failed: error -53: recursion limit exceeded
-    
-# These three have infinitely nested recursions. 
-    
-/((?2))((?1))/
-    abc
-Failed: error -52: nested recursion at the same subject position
-
-/((?(R2)a+|(?1)b))/
-    aaaabcde
-Failed: error -52: nested recursion at the same subject position
-
-/(?(R)a*(?1)|((?R))b)/
-    aaaabcde
-Failed: error -52: nested recursion at the same subject position
-    
-# The allusedtext modifier does not work with JIT, which does not maintain
-# the leftchar/rightchar data.
-
-/abc(?=xyz)/allusedtext
-    abcxyzpqr
- 0: abcxyz
-       >>>
-    abcxyzpqr\=aftertext
- 0: abcxyz
-       >>>
- 0+ xyzpqr
-    
-/(?<=pqr)abc(?=xyz)/allusedtext
-    xyzpqrabcxyzpqr
- 0: pqrabcxyz
-    <<<   >>>
-    xyzpqrabcxyzpqr\=aftertext
- 0: pqrabcxyz
-    <<<   >>>
- 0+ xyzpqr
-    
-/a\b/
-    a.\=allusedtext
- 0: a.
-     >
-    a\=allusedtext  
- 0: a
-
-/abc\Kxyz/
-    abcxyz\=allusedtext
- 0: abcxyz
-    <<<   
-
-/abc(?=xyz(*ACCEPT))/
-    abcxyz\=allusedtext
- 0: abcxyz
-       >>>
-
-/abc(?=abcde)(?=ab)/allusedtext
-    abcabcdefg
- 0: abcabcde
-       >>>>>
-    
-# These tests provoke recursion loops, which give a different error message
-# when JIT is used.
-
-/(?R)/I
-Capturing subpattern count = 0
-May match empty string
-Subject length lower bound = 0
-    abcd
-Failed: error -52: nested recursion at the same subject position
-
-/(a|(?R))/I
-Capturing subpattern count = 1
-May match empty string
-Subject length lower bound = 1
-    abcd
- 0: a
- 1: a
-    defg 
-Failed: error -52: nested recursion at the same subject position
-
-/(ab|(bc|(de|(?R))))/I
-Capturing subpattern count = 3
-May match empty string
-Subject length lower bound = 2
-    abcd
- 0: ab
- 1: ab
-    fghi 
-Failed: error -52: nested recursion at the same subject position
-
-/(ab|(bc|(de|(?1))))/I
-Capturing subpattern count = 3
-May match empty string
-Subject length lower bound = 2
-    abcd
- 0: ab
- 1: ab
-    fghi 
-Failed: error -52: nested recursion at the same subject position
-
-/x(ab|(bc|(de|(?1)x)x)x)/I
-Capturing subpattern count = 3
-First code unit = 'x'
-Subject length lower bound = 3
-    xab123
- 0: xab
- 1: ab
-    xfghi 
-Failed: error -52: nested recursion at the same subject position
-
-/(?!\w)(?R)/
-    abcd
-Failed: error -52: nested recursion at the same subject position
-    =abc 
-Failed: error -52: nested recursion at the same subject position
-
-/(?=\w)(?R)/
-    =abc 
-Failed: error -52: nested recursion at the same subject position
-    abcd
-Failed: error -52: nested recursion at the same subject position
-
-/(?<!\w)(?R)/
-    abcd
-Failed: error -52: nested recursion at the same subject position
-
-/(?<=\w)(?R)/
-    abcd
-Failed: error -52: nested recursion at the same subject position
-
-/(a+|(?R)b)/
-    aaa
- 0: aaa
- 1: aaa
-    bbb 
-Failed: error -52: nested recursion at the same subject position
-
-/[^\xff]((?1))/BI
------------------------------------------------------------------
-        Bra
-        [^\x{ff}]
-        CBra 1
-        Recurse
-        Ket
-        Ket
-        End
------------------------------------------------------------------
-Capturing subpattern count = 1
-Subject length lower bound = 1
-    abcd
-Failed: error -52: nested recursion at the same subject position
-
-# End of testinput14
--- a/testdata/testoutput14-16
+++ b/testdata/testoutput14-16
@ -0,0 +1,61 @@
+# These test special (mostly error) UTF features of DFA matching. They are a 
+# selection of the more comprehensive tests that are run for non-DFA matching.
+# The output is different for the different widths.
+
+#subject dfa
+
+/X/utf
+    XX\x{d800}
+Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+    XX\x{d800}\=offset=3
+No match
+    XX\x{d800}\=no_utf_check
+ 0: X
+    XX\x{da00}
+Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+    XX\x{da00}\=no_utf_check
+ 0: X
+    XX\x{dc00}
+Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
+    XX\x{dc00}\=no_utf_check
+ 0: X
+    XX\x{de00}
+Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
+    XX\x{de00}\=no_utf_check
+ 0: X
+    XX\x{dfff}
+Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
+    XX\x{dfff}\=no_utf_check
+ 0: X
+    XX\x{110000}
+** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
+    XX\x{d800}\x{1234}
+Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
+          
+/badutf/utf
+    X\xdf
+No match
+    XX\xef
+No match
+    XXX\xef\x80
+No match
+    X\xf7
+No match
+    XX\xf7\x80
+No match
+    XXX\xf7\x80\x80
+No match
+
+/shortutf/utf
+    XX\xdf\=ph
+No match
+    XX\xef\=ph
+No match
+    XX\xef\x80\=ph
+No match
+    \xf7\=ph
+No match
+    \xf7\x80\=ph
+No match
+
+# End of testinput14
--- a/testdata/testoutput14-32
+++ b/testdata/testoutput14-32
@ -0,0 +1,61 @@
+# These test special (mostly error) UTF features of DFA matching. They are a 
+# selection of the more comprehensive tests that are run for non-DFA matching.
+# The output is different for the different widths.
+
+#subject dfa
+
+/X/utf
+    XX\x{d800}
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{d800}\=offset=3
+No match
+    XX\x{d800}\=no_utf_check
+ 0: X
+    XX\x{da00}
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{da00}\=no_utf_check
+ 0: X
+    XX\x{dc00}
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{dc00}\=no_utf_check
+ 0: X
+    XX\x{de00}
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{de00}\=no_utf_check
+ 0: X
+    XX\x{dfff}
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{dfff}\=no_utf_check
+ 0: X
+    XX\x{110000}
+Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 2
+    XX\x{d800}\x{1234}
+Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
+          
+/badutf/utf
+    X\xdf
+No match
+    XX\xef
+No match
+    XXX\xef\x80
+No match
+    X\xf7
+No match
+    XX\xf7\x80
+No match
+    XXX\xf7\x80\x80
+No match
+
+/shortutf/utf
+    XX\xdf\=ph
+No match
+    XX\xef\=ph
+No match
+    XX\xef\x80\=ph
+No match
+    \xf7\=ph
+No match
+    \xf7\x80\=ph
+No match
+
+# End of testinput14
--- a/testdata/testoutput14-8
+++ b/testdata/testoutput14-8
@ -0,0 +1,61 @@
+# These test special (mostly error) UTF features of DFA matching. They are a 
+# selection of the more comprehensive tests that are run for non-DFA matching.
+# The output is different for the different widths.
+
+#subject dfa
+
+/X/utf
+    XX\x{d800}
+Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{d800}\=offset=3
+Error -36 (bad UTF-8 offset)
+    XX\x{d800}\=no_utf_check
+ 0: X
+    XX\x{da00}
+Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{da00}\=no_utf_check
+ 0: X
+    XX\x{dc00}
+Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{dc00}\=no_utf_check
+ 0: X
+    XX\x{de00}
+Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{de00}\=no_utf_check
+ 0: X
+    XX\x{dfff}
+Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
+    XX\x{dfff}\=no_utf_check
+ 0: X
+    XX\x{110000}
+Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 2
+    XX\x{d800}\x{1234}
+Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
+          
+/badutf/utf
+    X\xdf
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 1
+    XX\xef
+Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
+    XXX\xef\x80
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
+    X\xf7
+Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 1
+    XX\xf7\x80
+Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
+    XXX\xf7\x80\x80
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
+
+/shortutf/utf
+    XX\xdf\=ph
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
+    XX\xef\=ph
+Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
+    XX\xef\x80\=ph
+Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
+    \xf7\=ph
+Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
+    \xf7\x80\=ph
+Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
+
+# End of testinput14
--- a/testdata/testoutput15
+++ b/testdata/testoutput15
@ -1,17 +1,334 @@
-# This test is run only when JIT support is not available. It checks that an
-# attempt to use it has the expected behaviour. It also tests things that
-# are different without JIT.
+# These are:
+#
+# (1) Tests of the match-limiting features. The results are different for
+# interpretive or JIT matching, so this test should not be run with JIT. The
+# same tests are run using JIT in test 17.

-/abc/I,jit,jitverify
+# (2) Other tests that must not be run with JIT.
+
+/(a+)*zz/I
+Capturing subpattern count = 1
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
+Minimum match limit = 8
+Minimum recursion limit = 6
+ 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
+ 1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  aaaaaaaaaaaaaz\=find_limits
+Minimum match limit = 32768
+Minimum recursion limit = 29
+No match
+
+!((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 0
+   /* this is a C style comment */\=find_limits
+Minimum match limit = 120
+Minimum recursion limit = 6
+ 0: /* this is a C style comment */
+ 1: /* this is a C style comment */
+
+/^(?>a)++/
+    aa\=find_limits
+Minimum match limit = 5
+Minimum recursion limit = 2
+ 0: aa
+    aaaaaaaaa\=find_limits
+Minimum match limit = 12
+Minimum recursion limit = 2
+ 0: aaaaaaaaa
+    
+/(a)(?1)++/
+    aa\=find_limits
+Minimum match limit = 7
+Minimum recursion limit = 4
+ 0: aa
+ 1: a
+    aaaaaaaaa\=find_limits
+Minimum match limit = 21
+Minimum recursion limit = 4
+ 0: aaaaaaaaa
+ 1: a
+
+/a(?:.)*?a/ims
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 65
+Minimum recursion limit = 2
+ 0: abbbbbbbbbbbbbbbbbbbbba
+    
+/a(?:.(*THEN))*?a/ims
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 86
+Minimum recursion limit = 45
+ 0: abbbbbbbbbbbbbbbbbbbbba
+
+/a(?:.(*THEN:ABC))*?a/ims
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+Minimum match limit = 86
+Minimum recursion limit = 45
+ 0: abbbbbbbbbbbbbbbbbbbbba
+
+/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
+     aabbccddee\=find_limits
+Minimum match limit = 7
+Minimum recursion limit = 2
+ 0: aabbccddee
+
+/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
+     aabbccddee\=find_limits
+Minimum match limit = 17
+Minimum recursion limit = 16
+ 0: aabbccddee
+ 1: aa
+ 2: bb
+ 3: cc
+ 4: dd
+ 5: ee
+
+/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
+     aabbccddee\=find_limits
+Minimum match limit = 13
+Minimum recursion limit = 10
+ 0: aabbccddee
+ 1: aa
+ 2: cc
+ 3: ee
+
+/(*LIMIT_MATCH=12bc)abc/
+Failed: error 160 at offset 17: (*VERB) not recognized or malformed
+
+/(*LIMIT_MATCH=4294967290)abc/
+Failed: error 160 at offset 24: (*VERB) not recognized or malformed
+
+/(*LIMIT_RECURSION=4294967280)abc/I
 Capturing subpattern count = 0
+Recursion limit = 4294967280
 First code unit = 'a'
 Last code unit = 'c'
 Subject length lower bound = 3
-JIT support is not available in this version of PCRE2

-/a*/I
+/(a+)*zz/
+    aaaaaaaaaaaaaz
+No match
+    aaaaaaaaaaaaaz\=match_limit=3000
+Failed: error -47: match limit exceeded
+
+/(a+)*zz/
+    aaaaaaaaaaaaaz\=recursion_limit=10
+Failed: error -53: recursion limit exceeded
+
+/(*LIMIT_MATCH=3000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 3000
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+    aaaaaaaaaaaaaz
+Failed: error -47: match limit exceeded
+    aaaaaaaaaaaaaz\=match_limit=60000
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 3000
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+    aaaaaaaaaaaaaz
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_MATCH=60000)(a+)*zz/I
+Capturing subpattern count = 1
+Match limit = 60000
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+    aaaaaaaaaaaaaz
+No match
+    aaaaaaaaaaaaaz\=match_limit=3000
+Failed: error -47: match limit exceeded
+
+/(*LIMIT_RECURSION=10)(a+)*zz/I
+Capturing subpattern count = 1
+Recursion limit = 10
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+    aaaaaaaaaaaaaz
+Failed: error -53: recursion limit exceeded
+    aaaaaaaaaaaaaz\=recursion_limit=1000
+Failed: error -53: recursion limit exceeded
+
+/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
+Capturing subpattern count = 1
+Recursion limit = 1000
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+    aaaaaaaaaaaaaz
+No match
+
+/(*LIMIT_RECURSION=1000)(a+)*zz/I
+Capturing subpattern count = 1
+Recursion limit = 1000
+Starting code units: a z 
+Last code unit = 'z'
+Subject length lower bound = 2
+    aaaaaaaaaaaaaz
+No match
+    aaaaaaaaaaaaaz\=recursion_limit=10
+Failed: error -53: recursion limit exceeded
+    
+# These three have infinitely nested recursions. 
+    
+/((?2))((?1))/
+    abc
+Failed: error -52: nested recursion at the same subject position
+
+/((?(R2)a+|(?1)b))/
+    aaaabcde
+Failed: error -52: nested recursion at the same subject position
+
+/(?(R)a*(?1)|((?R))b)/
+    aaaabcde
+Failed: error -52: nested recursion at the same subject position
+    
+# The allusedtext modifier does not work with JIT, which does not maintain
+# the leftchar/rightchar data.
+
+/abc(?=xyz)/allusedtext
+    abcxyzpqr
+ 0: abcxyz
+       >>>
+    abcxyzpqr\=aftertext
+ 0: abcxyz
+       >>>
+ 0+ xyzpqr
+    
+/(?<=pqr)abc(?=xyz)/allusedtext
+    xyzpqrabcxyzpqr
+ 0: pqrabcxyz
+    <<<   >>>
+    xyzpqrabcxyzpqr\=aftertext
+ 0: pqrabcxyz
+    <<<   >>>
+ 0+ xyzpqr
+    
+/a\b/
+    a.\=allusedtext
+ 0: a.
+     >
+    a\=allusedtext  
+ 0: a
+
+/abc\Kxyz/
+    abcxyz\=allusedtext
+ 0: abcxyz
+    <<<   
+
+/abc(?=xyz(*ACCEPT))/
+    abcxyz\=allusedtext
+ 0: abcxyz
+       >>>
+
+/abc(?=abcde)(?=ab)/allusedtext
+    abcabcdefg
+ 0: abcabcde
+       >>>>>
+    
+# These tests provoke recursion loops, which give a different error message
+# when JIT is used.
+
+/(?R)/I
 Capturing subpattern count = 0
 May match empty string
 Subject length lower bound = 0
+    abcd
+Failed: error -52: nested recursion at the same subject position
+
+/(a|(?R))/I
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 1
+    abcd
+ 0: a
+ 1: a
+    defg 
+Failed: error -52: nested recursion at the same subject position
+
+/(ab|(bc|(de|(?R))))/I
+Capturing subpattern count = 3
+May match empty string
+Subject length lower bound = 2
+    abcd
+ 0: ab
+ 1: ab
+    fghi 
+Failed: error -52: nested recursion at the same subject position
+
+/(ab|(bc|(de|(?1))))/I
+Capturing subpattern count = 3
+May match empty string
+Subject length lower bound = 2
+    abcd
+ 0: ab
+ 1: ab
+    fghi 
+Failed: error -52: nested recursion at the same subject position
+
+/x(ab|(bc|(de|(?1)x)x)x)/I
+Capturing subpattern count = 3
+First code unit = 'x'
+Subject length lower bound = 3
+    xab123
+ 0: xab
+ 1: ab
+    xfghi 
+Failed: error -52: nested recursion at the same subject position
+
+/(?!\w)(?R)/
+    abcd
+Failed: error -52: nested recursion at the same subject position
+    =abc 
+Failed: error -52: nested recursion at the same subject position
+
+/(?=\w)(?R)/
+    =abc 
+Failed: error -52: nested recursion at the same subject position
+    abcd
+Failed: error -52: nested recursion at the same subject position
+
+/(?<!\w)(?R)/
+    abcd
+Failed: error -52: nested recursion at the same subject position
+
+/(?<=\w)(?R)/
+    abcd
+Failed: error -52: nested recursion at the same subject position
+
+/(a+|(?R)b)/
+    aaa
+ 0: aaa
+ 1: aaa
+    bbb 
+Failed: error -52: nested recursion at the same subject position
+
+/[^\xff]((?1))/BI
+------------------------------------------------------------------
+        Bra
+        [^\x{ff}]
+        CBra 1
+        Recurse
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Subject length lower bound = 1
+    abcd
+Failed: error -52: nested recursion at the same subject position

 # End of testinput15
--- a/testdata/testoutput16
+++ b/testdata/testoutput16
--- a/testdata/testoutput17
+++ b/testdata/testoutput17
--- a/testdata/testoutput18
+++ b/testdata/testoutput18
@ -1,20 +1,148 @@
 # This set of tests is run only with the 8-bit library. It tests the POSIX
-# interface with UTF/UCP support, which is supported only with the 8-bit
-# library. This test should not be run with JIT (which is not available for the
-# POSIX interface).
+# interface, which is supported only with the 8-bit library. This test should
+# not be run with JIT (which is not available for the POSIX interface).
    
+#forbid_utf
 #pattern posix

-/a\x{1234}b/utf
-    a\x{1234}b
- 0: a\x{1234}b
+# Test invalid options

-/\w/
-    +++\x{c2}
+/abc/auto_callout
+** Ignored with POSIX interface: auto_callout
+
+/abc/
+   abc\=find_limits
+** Ignored with POSIX interface: find_limits
+ 0: abc
+
+/abc/
+  abc\=partial_hard
+** Ignored with POSIX interface: partial_hard
+ 0: abc
+
+# Real tests
+
+/abc/
+    abc
+ 0: abc
+    *** Failers
 No match: POSIX code 17: match failed

-/\w/ucp
-    +++\x{c2}
- 0: \xc2
+/^abc|def/
+    abcdef
+ 0: abc
+    abcdef\=notbol
+ 0: def

-# End of testdata/testinput17
+/.*((abc)$|(def))/
+    defabc
+ 0: defabc
+ 1: abc
+ 2: abc
+    defabc\=noteol
+ 0: def
+ 1: def
+ 3: def
+
+/the quick brown fox/
+    the quick brown fox
+ 0: the quick brown fox
+    *** Failers
+No match: POSIX code 17: match failed
+    The Quick Brown Fox
+No match: POSIX code 17: match failed
+
+/the quick brown fox/i
+    the quick brown fox
+ 0: the quick brown fox
+    The Quick Brown Fox
+ 0: The Quick Brown Fox
+
+/abc.def/
+    *** Failers
+No match: POSIX code 17: match failed
+    abc\ndef
+No match: POSIX code 17: match failed
+
+/abc$/
+    abc
+ 0: abc
+    abc\n
+ 0: abc
+
+/(abc)\2/
+Failed: POSIX code 15: bad back reference at offset 6     
+
+/(abc\1)/
+    abc
+No match: POSIX code 17: match failed
+
+/a*(b+)(z)(z)/
+    aaaabbbbzzzz
+ 0: aaaabbbbzz
+ 1: bbbb
+ 2: z
+ 3: z
+    aaaabbbbzzzz\=ovector=0
+Matched without capture
+    aaaabbbbzzzz\=ovector=1
+ 0: aaaabbbbzz
+    aaaabbbbzzzz\=ovector=2
+ 0: aaaabbbbzz
+ 1: bbbb
+
+/ab.cd/
+    ab-cd
+ 0: ab-cd
+    ab=cd
+ 0: ab=cd
+    ** Failers
+No match: POSIX code 17: match failed
+    ab\ncd
+No match: POSIX code 17: match failed
+
+/ab.cd/s
+    ab-cd
+ 0: ab-cd
+    ab=cd
+ 0: ab=cd
+    ab\ncd
+ 0: ab\x0acd
+
+/a(b)c/no_auto_capture
+    abc
+Matched with REG_NOSUB
+
+/a(?P<name>b)c/no_auto_capture
+    abc
+Matched with REG_NOSUB
+
+/a?|b?/
+    abc
+ 0: a
+    ** Failers
+ 0: 
+    ddd\=notempty
+No match: POSIX code 17: match failed
+
+/\w+A/
+   CDAAAAB
+ 0: CDAAAA
+
+/\w+A/ungreedy
+   CDAAAAB
+ 0: CDA
+   
+/\Biss\B/I,aftertext
+** Ignored with POSIX interface: info
+    Mississippi
+ 0: iss
+ 0+ issippi
+
+/abc/\
+Failed: POSIX code 9: bad escape sequence at offset 4     
+
+"(?(?C)"
+Failed: POSIX code 3: pattern error at offset 2     
+
+# End of testdata/testinput18
--- a/testdata/testoutput19
+++ b/testdata/testoutput19
@ -1,100 +1,20 @@
-# This set of tests exercises the serialization/deserialization functions in
-# the library. It does not use UTF or JIT.
+# This set of tests is run only with the 8-bit library. It tests the POSIX
+# interface with UTF/UCP support, which is supported only with the 8-bit
+# library. This test should not be run with JIT (which is not available for the
+# POSIX interface).
    
-#forbid_utf
+#pattern posix

-# Compile several patterns, push them onto the stack, and then write them
-# all to a file.
+/a\x{1234}b/utf
+    a\x{1234}b
+ 0: a\x{1234}b

-#pattern push
+/\w/
+    +++\x{c2}
+No match: POSIX code 17: match failed

-/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
-  (?(DEFINE)
-  (?<NAME_PAT>[a-z]+)
-  (?<ADDRESS_PAT>\d+)
-  )/x
-/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
+/\w/ucp
+    +++\x{c2}
+ 0: \xc2
    
-#save testsaved1
-
-# Do it again for some more patterns.
-
-/(*MARK:A)(*SKIP:B)(C|X)/mark
-** Ignored when compiled pattern is stacked with 'push': mark
-/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
-
-#save testsaved2
-#pattern -push
-
-# Reload the patterns, then pop them one by one and check them.
-
-#load testsaved1
-#load testsaved2
-
-#pop info
-Capturing subpattern count = 2
-Max back reference = 2
-Named capturing subpatterns:
-  n   1
-  n   2
-Options: dupnames
-Starting code units: b f 
-Subject length lower bound = 6
-    foofoo             
- 0: foofoo
- 1: foo
-    barbar
- 0: barbar
- 1: <unset>
- 2: bar
-    
-#pop mark
-    C
- 0: C
- 1: C
-MK: A
-    D 
-No match, mark = A
-    
-#pop
-    AmanaplanacanalPanama   
- 0: AmanaplanacanalPanama
- 1: <unset>
- 2: <unset>
- 3: AmanaplanacanalPanama
- 4: A
-
-#pop info
-Capturing subpattern count = 4
-Named capturing subpatterns:
-  ADDR          2
-  ADDRESS_PAT   4
-  NAME          1
-  NAME_PAT      3
-Options: extended
-Subject length lower bound = 3
-    metcalfe 33
- 0: metcalfe 33
- 1: metcalfe
- 2: 33
-    
-# Check for an error when different tables are used.
-
-/abc/push,tables=1
-/xyz/push,tables=2
-#save testsaved1
-Serialization failed: error -30: patterns do not all use the same character tables
-
-#pop
-    xyz
- 0: xyz
-
-#pop
-    abc
- 0: abc
-
-#pop should give an error
-** Can't pop off an empty stack
-    pqr
-
-# End of testinput19 
+# End of testdata/testinput19
--- a/testdata/testoutput20
+++ b/testdata/testoutput20
@ -0,0 +1,100 @@
+# This set of tests exercises the serialization/deserialization functions in
+# the library. It does not use UTF or JIT.
+
+#forbid_utf
+
+# Compile several patterns, push them onto the stack, and then write them
+# all to a file.
+
+#pattern push
+
+/(?<NAME>(?&NAME_PAT))\s+(?<ADDR>(?&ADDRESS_PAT))
+  (?(DEFINE)
+  (?<NAME_PAT>[a-z]+)
+  (?<ADDRESS_PAT>\d+)
+  )/x
+/^(?:((.)(?1)\2|)|((.)(?3)\4|.))$/i
+
+#save testsaved1
+
+# Do it again for some more patterns.
+
+/(*MARK:A)(*SKIP:B)(C|X)/mark
+** Ignored when compiled pattern is stacked with 'push': mark
+/(?:(?<n>foo)|(?<n>bar))\k<n>/dupnames
+
+#save testsaved2
+#pattern -push
+
+# Reload the patterns, then pop them one by one and check them.
+
+#load testsaved1
+#load testsaved2
+
+#pop info
+Capturing subpattern count = 2
+Max back reference = 2
+Named capturing subpatterns:
+  n   1
+  n   2
+Options: dupnames
+Starting code units: b f 
+Subject length lower bound = 6
+    foofoo             
+ 0: foofoo
+ 1: foo
+    barbar
+ 0: barbar
+ 1: <unset>
+ 2: bar
+    
+#pop mark
+    C
+ 0: C
+ 1: C
+MK: A
+    D 
+No match, mark = A
+    
+#pop
+    AmanaplanacanalPanama   
+ 0: AmanaplanacanalPanama
+ 1: <unset>
+ 2: <unset>
+ 3: AmanaplanacanalPanama
+ 4: A
+
+#pop info
+Capturing subpattern count = 4
+Named capturing subpatterns:
+  ADDR          2
+  ADDRESS_PAT   4
+  NAME          1
+  NAME_PAT      3
+Options: extended
+Subject length lower bound = 3
+    metcalfe 33
+ 0: metcalfe 33
+ 1: metcalfe
+ 2: 33
+    
+# Check for an error when different tables are used.
+
+/abc/push,tables=1
+/xyz/push,tables=2
+#save testsaved1
+Serialization failed: error -30: patterns do not all use the same character tables
+
+#pop
+    xyz
+ 0: xyz
+
+#pop
+    abc
+ 0: abc
+
+#pop should give an error
+** Can't pop off an empty stack
+    pqr
+
+# End of testinput20