Further work on pcre2test (can now display compiled code).

2014-05-13 11:20:03 +00:00 · 2014-05-13 11:20:03 +00:00 · 225992aa3a
parent 9812ca8b0a
commit 225992aa3a
32 changed files with 38225 additions and 937 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -269,25 +269,26 @@ COMMON_SOURCES = \
  src/pcre2_error.c \
  src/pcre2_match.c \
  src/pcre2_internal.h \
  src/pcre2_intmodedep.h \
  src/pcre2_jit_compile.c \
  src/pcre2_jit_match.c \
  src/pcre2_jit_misc.c \
  src/pcre2_maketables.c \
  src/pcre2_match_data.c \
  src/pcre2_pattern_info.c \
  src/pcre2_string_utils.c \
  src/pcre2_substring.c \
  src/pcre2_tables.c \
  src/pcre2_ucd.c \
  src/pcre2_ucp.h \
  src/pcre2_version.c
 #  src/pcre2_newline.c \
 #  src/pcre2_ord2utf8.c \
 #  src/pcre2_refcount.c \
 #  src/pcre2_string_utils.c \
 #  src/pcre2_study.c \
 #  src/pcre2_tables.c \
 #  src/pcre2_ucd.c \
 #  src/pcre2_valid_utf8.c \
-#  src/pcre2_xclass.c \
+#  src/pcre2_xclass.c
 #  src/ucp.h
 if WITH_PCRE8
--- a/maint/GenerateUtt.py
+++ b/maint/GenerateUtt.py
@ -0,0 +1,113 @@
 #! /usr/bin/python
 # Generate utt tables. Note: this script is written in Python 2 and is
 # incompatible with Python 3. However, the 2to3 conversion script has been 
 # successfully tested on it.
 # The source file pcre2_tables.c contains (amongst other things), a table that
 # is indexed by script name. In order to reduce the number of relocations when
 # loading the library, the names are held as a single large string, with
 # offsets in the table. This is tedious to maintain by hand. Therefore, this
 # script is used to generate the table. The output is sent to stdout; usually
 # that should be directed to a temporary file. Then pcre2_tables.c can be
 # edited by replacing the relevant definitions and table therein with the
 # temporary file.
 # Modified by PH 17-March-2009 to generate the more verbose form that works
 # for UTF-support in EBCDIC as well as ASCII environments.
 # Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
 # Modified by PH 04-May-2010 to add new "X.." special categories.
 # Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
 # Modified by ChPe 30-September-2012 to add this note; no other changes were
 # necessary for Unicode 6.2.0 support.
 # Modified by PH 26-February-2013 to add the Xuc special category.
 # Comment modified by PH 13-May-2014 to update to PCRE2 file names.
 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
 # New for Unicode 5.0
 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
 # New for Unicode 5.1
 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
 # New for Unicode 5.2
 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
 # New for Unicode 6.0.0
 'Batak', 'Brahmi', 'Mandaic', \
 # New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
 ]
 category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
 general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
 # First add the Unicode script and category names.
 utt_table  = zip(script_names, ['PT_SC'] * len(script_names))
 utt_table += zip(category_names, ['PT_PC'] * len(category_names))
 utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
 # Now add our own specials.
 utt_table.append(('Any', 'PT_ANY'))
 utt_table.append(('L&',  'PT_LAMP'))
 utt_table.append(('Xan', 'PT_ALNUM'))
 utt_table.append(('Xps', 'PT_PXSPACE'))
 utt_table.append(('Xsp', 'PT_SPACE'))
 utt_table.append(('Xuc', 'PT_UCNC'))
 utt_table.append(('Xwd', 'PT_WORD'))
 # Sort the table.
 utt_table.sort()
 # We have to use STR_ macros to define the strings so that it all works in
 # UTF-8 mode on EBCDIC platforms.
 # First pass: emit one "#define STRING_<name>0 ..." macro per table entry.
 # Each macro spells the name out as a sequence of STR_x character macros
 # followed by "\0". An '&' in the name is replaced by "_AMPERSAND" in the
 # macro identifier because '&' is not valid in a C identifier.
 for utt in utt_table:
        # The trailing comma on a Python 2 print statement suppresses the
        # newline, so the whole #define ends up on a single output line.
        print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
        for c in utt[0]:
                if c == '_':
                        print 'STR_UNDERSCORE',
                elif c == '&':
                        print 'STR_AMPERSAND',
                else:
                        print 'STR_%s' % c,;
        # Terminate the macro body (and the output line) with a literal "\0".
        print '"\\0"'
 # Print the actual table, using the string names
 print ''
 print 'const char PRIV(utt_names)[] =';
 # 'last' is the suffix for the current output line: empty for every entry
 # except the final one, which closes the C initializer with ';'.
 last = ''
 for utt in utt_table:
        if utt == utt_table[-1]:
                last = ';'
        print '  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
 # This was how it was done before the EBCDIC-compatible modification.
 #        print '  "%s\\0"%s' % (utt[0], last)
 # Second table: one { name offset, property type, value } row per entry, in
 # the same (sorted) order as the strings above.
 print '\nconst ucp_type_table PRIV(utt)[] = {'
 offset = 0
 last = ','
 for utt in utt_table:
        # The special PT_xxx property types added by this script carry no
        # ucp_ value, so 0 is emitted; scripts and categories use ucp_<name>.
        if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', 
          'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
                value = '0'
        else:
                value = 'ucp_' + utt[0]
        # No trailing comma after the final row.
        if utt == utt_table[-1]:
                last = ''
        print '  { %3d, %s, %s }%s' % (offset, utt[1], value, last)
        # Advance past this name plus the "\0" that terminates it in the
        # concatenated utt_names string.
        offset += len(utt[0]) + 1
 print '};'
--- a/maint/ManyConfigTests
+++ b/maint/ManyConfigTests
@ -0,0 +1,305 @@
 #! /bin/sh
 # This is a script for the use of PCRE maintainers. It configures and rebuilds
 # PCRE2 with a variety of configuration options, and in each case runs the
 # tests to ensure that all goes well. Every possible combination would take far
 # too long, so we use a representative sample. This script should be run in the
 # PCRE2 source directory.
 # Some of the tests have to be skipped when PCRE2 is built with non-Unix
 # newline recognition. I am planning to reduce this as much as possible in due
 # course.
 # This is in case the caller has set aliases (as I do - PH)
 unset cp ls mv rm
 # Use -v to make the output more verbose
 verbose=0
 if [ "$1" = "-v" ] ; then verbose=1; fi
 # This is a temporary directory for testing out-of-line builds
 tmp=/tmp/pcretesting
 # Don't bother with compiler optimization for most tests; it just slows down
 # compilation a lot (and running the tests themselves is quick). However, a
 # few specific tests turn optimization on, because it can provoke some compiler
 # warnings.
 CFLAGS="-g -O0"
 CXXFLAGS="$CFLAGS"
 ISGCC="no"
 # If the compiler is gcc, add a lot of warning switches.
 cc --version >zzz 2>/dev/null
 if [ $? -eq 0 ] && grep GCC zzz >/dev/null; then
  ISGCC="yes"
  CFLAGS="$CFLAGS -Wall"
  CFLAGS="$CFLAGS -Wno-overlength-strings"
  CFLAGS="$CFLAGS -Wpointer-arith"
  CFLAGS="$CFLAGS -Wwrite-strings"
  CFLAGS="$CFLAGS -Wundef -Wshadow"
  CFLAGS="$CFLAGS -Wmissing-field-initializers"
  CFLAGS="$CFLAGS -Wunused-parameter"  
  CFLAGS="$CFLAGS -Wextra -Wformat"
  CFLAGS="$CFLAGS -Wbad-function-cast"
  CFLAGS="$CFLAGS -Wmissing-declarations"
  CFLAGS="$CFLAGS -Wnested-externs"
  CFLAGS="$CFLAGS -pedantic"
  CFLAGS="$CFLAGS -Wuninitialized"
  CFLAGS="$CFLAGS -Wmissing-prototypes"
  CFLAGS="$CFLAGS -Wstrict-prototypes"
 fi
 # This function runs a single test with the set of configuration options that
 # are in $opts. The source directory must be set in srcdir.
 # runtest: configure, build and test one configuration.
 #
 # Inputs (shell variables set by the caller):
 #   opts          configure options for this run ("" means defaults)
 #   srcdir        directory holding configure, RunTest, RunGrepTest
 #   valgrind      passed through to RunTest/RunGrepTest (empty or "valgrind")
 #   cvalgrind     command prefix for directly executed test programs
 #   withvalgrind  text appended to progress messages
 #   testcount     incremented here; testtotal is only displayed
 #
 # Any failure (configure, make errors OR warnings, a failing test) prints the
 # captured output and exits the whole script with status 1.
 #
 # The function is declared with the POSIX "name()" form; the "function"
 # keyword is not accepted by every /bin/sh.
 runtest()
  {
  rm -f *_unittest
  testcount=`expr $testcount + 1`

  if [ "$opts" = "" ] ; then
    echo "[$testcount/$testtotal] Configuring with: default settings"
  else
    echo "[$testcount/$testtotal] Configuring with:"
    echo "  $opts"
  fi

  # $opts is deliberately unquoted so that it splits into separate options.
  CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" \
    $srcdir/configure $opts >/dev/null 2>teststderr

  if [ $? -ne 0 ]; then
    echo " "
    echo "**** Error while configuring ****"
    cat teststderr
    exit 1
  fi

  echo "Making"
  make -j >/dev/null 2>teststderr

  # Anything at all on stderr (i.e. a compiler warning) counts as a failure.
  if [ $? -ne 0 -o -s teststderr ]; then
    echo " "
    echo "**** Errors or warnings while making ****"
    echo " "
    cat teststderr
    exit 1
  fi

  if [ $verbose -eq 1 ]; then
    ./pcre2test -C
  fi

  # Query the freshly built pcre2test for its configuration. For the jit and
  # utf queries the exit status is what is used below: a non-zero status is
  # treated as "feature available".
  nl=`./pcre2test -C newline`
  ./pcre2test -C jit >/dev/null
  jit=$?
  ./pcre2test -C utf >/dev/null
  utf=$?

  if [ "$nl" = "LF" -o "$nl" = "ANY" ]; then
    echo "Running C library tests $withvalgrind"
    $srcdir/RunTest $valgrind >teststdout
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping C library tests: newline is $nl"
  fi

  if [ "$nl" = "LF" ]; then
    echo "Running pcre2grep tests $withvalgrind"
    $srcdir/RunGrepTest $valgrind >teststdout 2>teststderr
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderr
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping pcre2grep tests: newline is $nl"
  fi

  if [ "$jit" -gt 0 -a $utf -gt 0 ]; then
    echo "Running JIT regression tests $withvalgrind"
    $cvalgrind $srcdir/pcre2_jit_test >teststdout 2>teststderr
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderr
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping JIT regression tests: JIT or UTF not enabled"
  fi

 #  if [ "$nl" = "LF" -o "$nl" = "ANY" ]; then
 #    if [ -f pcrecpp_unittest ] ; then
 #      for utest in pcrecpp_unittest \
 #                   pcre_scanner_unittest \
 #                   pcre_stringpiece_unittest
 #      do
 #        echo "Running $utest $withvalgrind"
 #        $cvalgrind $utest >teststdout
 #        if [ $? -ne 0 ]; then
 #          echo " "
 #          echo "**** Test failed ****"
 #          cat teststdout
 #          exit 1
 #        fi
 #      done
 #    else
 #      echo "Skipping C++ tests: pcrecpp_unittest does not exist"
 #    fi
 #  else
 #    echo "Skipping C++ tests: newline is $nl"
 #  fi
  }
 # Update the total count whenever a new test is added; it is used to show
 # progress as each test is run.
 testtotal=40
 testcount=0
 # This set of tests builds PCRE and runs the tests with a variety of configure
 # options, in the current (source) directory. The empty configuration builds
 # with all the default settings. As well as testing that these options work, we
 # use --disable-shared or --disable-static after the default test (which builds
 # both) to save a bit of time by building only one version of the library for
 # the subsequent tests.
 valgrind=
 cvalgrind=
 withvalgrind=
 srcdir=.
 export srcdir
 # If gcc is in use, run a maximally configured test with -O2, because that can
 # throw up warnings that are not detected with -O0.
 if [ "$ISGCC" = "yes" ]; then
  echo "Maximally configured test with -O2"
  # Save the baseline flags so that -O2 applies to this one run only. The
  # variable name must match the one restored below; otherwise CFLAGS would
  # be reset to an empty string and every later test would silently lose the
  # debug and warning switches.
  SAVECFLAGS="$CFLAGS"
  CFLAGS="$CFLAGS -O2"
  opts="--disable-shared --enable-utf --enable-jit --enable-pcre16 --enable-pcre32"
  runtest
  CFLAGS="$SAVECFLAGS"
 fi
 echo "General tests in the current directory"
 for opts in \
  "" \
  "--enable-utf --disable-static" \
  "--disable-stack-for-recursion --disable-shared" \
  "--enable-utf --disable-shared" \
  "--enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-utf --with-link-size=3 --disable-shared" \
  "--enable-rebuild-chartables --disable-shared" \
  "--enable-newline-is-any --disable-shared" \
  "--enable-newline-is-cr --disable-shared" \
  "--enable-newline-is-crlf --disable-shared" \
  "--enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
  "--enable-utf --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
  "--enable-jit --disable-shared" \
  "--enable-jit --enable-utf --disable-shared" \
  "--enable-jit --enable-utf --with-link-size=3 --disable-shared" \
  "--enable-pcre16" \
  "--enable-pcre16 --enable-jit --enable-utf --disable-shared" \
  "--enable-pcre16 --enable-jit --disable-pcre8 --disable-shared" \
  "--enable-pcre16 --enable-jit --disable-pcre8 --enable-utf --disable-shared" \
  "--enable-pcre16 --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre16 --enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre16 --enable-jit --enable-utf --with-link-size=3 --disable-shared" \
  "--enable-pcre16 --enable-jit --enable-utf --with-link-size=4 --disable-shared" \
  "--enable-pcre32" \
  "--enable-pcre32 --enable-jit --enable-utf --disable-shared" \
  "--enable-pcre32 --enable-jit --disable-pcre8 --disable-shared" \
  "--enable-pcre32 --enable-jit --disable-pcre8 --enable-utf --disable-shared" \
  "--enable-pcre32 --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre32 --enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre32 --enable-jit --enable-utf --with-link-size=4 --disable-shared" \
  "--enable-pcre32 --enable-pcre16 --disable-shared" \
  "--enable-pcre32 --enable-pcre16 --disable-pcre8 --disable-shared" \
  "--enable-pcre32 --enable-pcre16 --disable-pcre8 --enable-jit --enable-utf --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
 do
  runtest
 done
 # Now re-run some of the tests under valgrind.
 echo "Tests in the current directory using valgrind"
 valgrind=valgrind
 cvalgrind="valgrind -q --smc-check=all"
 withvalgrind="with valgrind"
 for opts in \
  "--enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-utf --with-link-size=3 --disable-shared" \
  "--enable-jit --enable-utf --disable-shared" \
  "--enable-pcre16 --enable-pcre32 --enable-jit --enable-utf " \
  "--disable-shared"
 do
  opts="--enable-valgrind $opts"
  runtest
 done
 valgrind=
 cvalgrind=
 withvalgrind=
 # Clean up the distribution and then do at least one build and test in a
 # directory other than the source directory. It doesn't work unless the
 # source directory is cleaned up first.
 if [ -f Makefile ]; then
  echo "Running 'make distclean'"
  make distclean >/dev/null 2>&1
  if [ $? -ne 0 ]; then
    echo "** 'make distclean' failed"
    exit 1
  fi
 fi
 echo "Tests in the $tmp directory"
 srcdir=`pwd`
 export srcdir
 if [ ! -e $tmp ]; then
  mkdir $tmp
 fi
 if [ ! -d $tmp ]; then
  echo "** Failed to create $tmp or it is not a directory"
  exit 1
 fi
 cd $tmp
 if [ $? -ne 0 ]; then
  echo "** Failed to cd to $tmp"
  exit 1
 fi
 for opts in \
  "--enable-utf --disable-shared"
 do
  runtest
 done
 echo "Removing $tmp"
 rm -rf $tmp
 echo "All done"
 # End
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -0,0 +1,505 @@
 #! /usr/bin/python
 # Multistage table builder
 # (c) Peter Kankowski, 2008
 ##############################################################################
 # This script was submitted to the PCRE project by Peter Kankowski as part of
 # the upgrading of Unicode property support. The new code speeds up property
 # matching many times. The script is for the use of PCRE maintainers, to
 # generate the pcre2_ucd.c file that contains a digested form of the Unicode
 # data tables.
 #
 # The script should be run in the maint subdirectory, using the command
 #
 # [python2] ./MultiStage2.py >../src/pcre2_ucd.c
 #
 # It requires four Unicode data tables, DerivedGeneralCategory.txt,
 # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the 
 # Unicode.tables subdirectory. The first of these is found in the "extracted" 
 # subdirectory of the Unicode database (UCD) on the Unicode web site; the 
 # second is in the "auxiliary" subdirectory; the other two are directly in the 
 # UCD directory.
 #
 # Minor modifications made to this script:
 #  Added #! line at start
 #  Removed tabs
 #  Made it work with Python 2.4 by rewriting two statements that needed 2.5
 #  Consequent code tidy
 #  Adjusted data file names to take from the Unicode.tables directory
 #  Adjusted global table names by prefixing _pcre_.
 #  Commented out stuff relating to the casefolding table, which isn't used;
 #    removed completely in 2012.
 #  Corrected size calculation
 #  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
 #  Update for PCRE2: name changes and SUPPORT_UCP is abolished.
 #
 # Major modifications made to this script:
 #  Added code to add a grapheme break property field to records.
 #
 #  Added code to search for sets of more than two characters that must match
 #  each other caselessly. A new table is output containing these sets, and
 #  offsets into the table are added to the main output records. This new
 #  code scans CaseFolding.txt instead of UnicodeData.txt.
 #
 # The main tables generated by this script are used by macros defined in
 # pcre2_internal.h. They look up Unicode character properties using short 
 # sequences of code that contains no branches, which makes for greater speed.
 #
 # Conceptually, there is a table of records (of type ucd_record), containing a
 # script number, character type, grapheme break type, offset to caseless
 # matching set, and offset to the character's other case for every character.
 # However, a real table covering all Unicode characters would be far too big.
 # It can be efficiently compressed by observing that many characters have the
 # same record, and many blocks of characters (taking 128 characters in a block)
 # have the same set of records as other blocks. This leads to a 2-stage lookup
 # process.
 #
 # This script constructs four tables. The ucd_caseless_sets table contains
 # lists of characters that all match each other caselessly. Each list is
 # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
 # any valid character. The first list is empty; this is used for characters
 # that are not part of any list.
 #
 # The ucd_records table contains one instance of every unique record that is
 # required. The ucd_stage1 table is indexed by a character's block number, and
 # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
 # table of "virtual" blocks; each block is indexed by the offset of a character
 # within its own block, and the result is the offset of the required record.
 #
 # Example: lowercase "a" (U+0061) is in block 0
 #          lookup 0 in stage1 table yields 0
 #          lookup 97 in the first table in stage2 yields 16
 #          record 17 is { 33, 5, 11, 0, -32 } 
 #            33 = ucp_Latin   => Latin script
 #             5 = ucp_Ll      => Lower case letter
 #            11 = ucp_gbOther => Grapheme break property "Other"
 #             0               => not part of a caseless set
 #           -32               => Other case is U+0041
 #         
 # Almost all lowercase latin characters resolve to the same record. One or two
 # are different because they are part of a multi-character caseless set (for
 # example, k, K and the Kelvin symbol are such a set).
 #
 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
 #          lookup 96 in stage1 table yields 88
 #          lookup 66 in the 88th table in stage2 yields 467
 #          record 470 is { 26, 7, 11, 0, 0 } 
 #            26 = ucp_Hiragana => Hiragana script
 #             7 = ucp_Lo       => Other letter
 #            11 = ucp_gbOther  => Grapheme break property "Other"
 #             0                => not part of a caseless set
 #             0                => No other case 
 #
 # In these examples, no other blocks resolve to the same "virtual" block, as it
 # happens, but plenty of other blocks do share "virtual" blocks.
 #
 # There is a fourth table, maintained by hand, which translates from the 
 # individual character types such as ucp_Cc to the general types like ucp_C.
 #
 #  Philip Hazel, 03 July 2008
 #
 # 01-March-2010:     Updated list of scripts for Unicode 5.2.0
 # 30-April-2011:     Updated list of scripts for Unicode 6.0.0
 #     July-2012:     Updated list of scripts for Unicode 6.1.0
 # 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new 
 #                      field in the record to hold the value. Luckily, the 
 #                      structure had a hole in it, so the resulting table is
 #                      not much bigger than before.
 # 18-September-2012: Added code for multiple caseless sets. This uses the
 #                      final hole in the structure.
 # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
 # 13-May-2014:       Updated for PCRE2
 ##############################################################################
 import re
 import string
 import sys
 MAX_UNICODE = 0x110000
 NOTACHAR = 0xffffffff
 # Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
 def make_get_names(enum):
        """Return a line parser that maps a property name to its index.

        The returned function takes the split fields of one data line and
        returns the position of the second field (the property name) within
        'enum'. It raises ValueError if the name is not in 'enum'.
        """
        def get_name_index(chardata):
                property_name = chardata[1]
                return enum.index(property_name)
        return get_name_index
 # Parse a line of CaseFolding.txt
 def get_other_case(chardata):
        """Return the other-case offset for one split CaseFolding.txt line.

        Only lines whose status field is 'C' or 'S' are used; for those the
        result is the mapping code point minus the source code point (both
        fields are hexadecimal). Every other status yields 0.
        """
        status = chardata[1]
        if status not in ('C', 'S'):
                return 0
        source = int(chardata[0], 16)
        target = int(chardata[2], 16)
        return target - source
 # Read the whole table in memory
 def read_table(file_name, get_value, default_value):
        """Read one Unicode data file into a list indexed by code point.

        file_name      path of the data file to read
        get_value      function called with the list of stripped,
                       semicolon-separated fields of each data line; its
                       result is stored for every code point on that line
        default_value  initial value of every entry

        Returns a list of MAX_UNICODE values. Text after '#' is ignored, as
        are lines that have no ';' separator. The first field is either a
        single hexadecimal code point or a range "XXXX..YYYY".

        Raises ValueError if the first field of a data line is not a code
        point or a range. The file is closed even if parsing fails.
        """
        table = [default_value] * MAX_UNICODE
        range_pattern = re.compile(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$')
        data_file = open(file_name, 'r')
        # try/finally rather than "with" keeps this usable on old Python 2.
        try:
                for line in data_file:
                        line = re.sub(r'#.*', '', line)
                        chardata = [field.strip() for field in line.split(';')]
                        if len(chardata) <= 1:
                                continue
                        value = get_value(chardata)
                        m = range_pattern.match(chardata[0])
                        if m is None:
                                raise ValueError("%s: cannot parse code point field %r"
                                                 % (file_name, chardata[0]))
                        char = int(m.group(1), 16)
                        if m.group(3) is None:
                                last = char
                        else:
                                last = int(m.group(3), 16)
                        for i in range(char, last + 1):
                                # It is important not to overwrite a previously set
                                # value because in the CaseFolding file there are lines
                                # to be ignored (returning the default value of 0)
                                # which often come after a line which has already set
                                # data.
                                if table[i] == default_value:
                                        table[i] = value
        finally:
                data_file.close()
        return table
 # Get the smallest possible C language type for the values
 def get_type_size(table):
        """Return (C type name, size in bytes) of the smallest fitting type.

        The candidates are tried in a fixed order - unsigned 8/16/32 bit,
        then signed 8/16/32 bit - and the first one whose range contains
        both min(table) and max(table) is returned.

        Raises OverflowError if no candidate can hold all the values.
        """
        candidates = [
                ("uint8_t", 1, 0, 255),
                ("uint16_t", 2, 0, 65535),
                ("uint32_t", 4, 0, 4294967295),
                ("signed char", 1, -128, 127),
                ("pcre_int16", 2, -32768, 32767),
                ("pcre_int32", 4, -2147483648, 2147483647),
        ]
        minval = min(table)
        maxval = max(table)
        for type_name, type_bytes, minlimit, maxlimit in candidates:
                if minlimit <= minval and maxval <= maxlimit:
                        return (type_name, type_bytes)
        # The call form of raise is valid in both Python 2 and Python 3.
        raise OverflowError("Too large to fit into C types")
 def get_tables_size(*tables):
        """Return the total size in bytes of all the given tables.

        Each table contributes its element count multiplied by the size of
        the C type that get_type_size() selects for it.
        """
        byte_counts = [get_type_size(t)[1] * len(t) for t in tables]
        return sum(byte_counts)
 # Compress the table into the two stages
 def compress_table(table, block_size):
        """Split 'table' into blocks and share identical blocks.

        Returns (stage1, stage2). stage2 is the concatenation of the distinct
        blocks in order of first appearance; stage1 has one entry per block
        of the input, giving the number of the stage2 block that holds its
        contents.
        """
        blocks = {} # Maps block contents to its block number in stage2
        stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
        stage2 = [] # Stage 2 table contains the blocks with property values
        table = tuple(table)
        for i in range(0, len(table), block_size):
                block = table[i:i+block_size]
                start = blocks.get(block)
                if start is None:
                        # Allocate a new block. Floor division keeps the
                        # block number an integer whichever division rules
                        # are in force.
                        start = len(stage2) // block_size
                        stage2.extend(block)
                        blocks[block] = start
                stage1.append(start)
        return stage1, stage2
 # Print a table
 # Write 'table' to stdout as a C array definition named 'table_name'. The
 # element type is the one chosen by get_type_size(). Without 'block_size' the
 # values are written 16 per line, each line followed by a U+XXXX comment;
 # with 'block_size' they are written as commented blocks of that many values.
 def print_table(table, table_name, block_size = None):
        type, size = get_type_size(table)
        ELEMS_PER_LINE = 16
        s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
        if block_size:
                s += ", block = %d" % block_size
        print s + " */"
        table = tuple(table)
        if block_size is None:
                # The format holds exactly ELEMS_PER_LINE value fields, so
                # every slice (including the last) must be that long;
                # assumes len(table) is a multiple of 16 - TODO confirm.
                fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
                # Number of code points covered by each table entry, used to
                # turn a table index into the code point shown in the comment.
                mult = MAX_UNICODE / len(table)
                for i in range(0, len(table), ELEMS_PER_LINE):
                        print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
        else:
                # 'el' is the number of values per output line: at most
                # ELEMS_PER_LINE, fewer if the block itself is smaller.
                if block_size > ELEMS_PER_LINE:
                        el = ELEMS_PER_LINE
                else:
                        el = block_size
                fmt = "%3d," * el + "\n"
                # Repeat the one-line format so it covers a whole block;
                # presumably block_size is a multiple of 16 when larger
                # than 16 - verify against caller.
                if block_size > ELEMS_PER_LINE:
                        fmt = fmt * (block_size / ELEMS_PER_LINE)
                for i in range(0, len(table), block_size):
                        print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
        print "};\n"
 # Extract the unique combinations of properties into records
 def combine_tables(*tables):
        """Collapse parallel property tables into unique records.

        The tables are read in step; each tuple of values at one position is
        a record. Returns (index, records): 'records' maps every distinct
        record to a number assigned in order of first appearance, and 'index'
        holds the record number for each position.
        """
        records = {}
        index = []
        for row in zip(*tables):
                if row not in records:
                        records[row] = len(records)
                index.append(records[row])
        return index, records
 def get_record_size_struct(records):
        """Return (size, structure) describing the C record layout.

        'records' is a sequence of equal-length tuples. For each field
        position the smallest C type that holds every value in that position
        is chosen with get_type_size(). 'size' is the resulting size in bytes
        including padding, and 'structure' is a C comment containing a
        matching typedef for ucd_record.
        """
        pieces = [
                '/* When recompiling tables with a new Unicode version, please check the\n',
                'types in this structure definition from pcre2_internal.h (the actual\n',
                'field names will be different):\n\ntypedef struct {\n',
        ]
        size = 0
        for i in range(len(records[0])):
                column = [record[i] for record in records]
                slice_type, slice_size = get_type_size(column)
                # add padding: round up to the next multiple of slice_size
                size = (size + slice_size - 1) & -slice_size
                size += slice_size
                pieces.append('%s property_%d;\n' % (slice_type, i))
        # round up to the first item of the next structure in array
        # NOTE(review): this pads to the size of the FIRST field only, which
        # looks right only when no later field is larger - confirm.
        first_column = [record[0] for record in records]
        slice_type, slice_size = get_type_size(first_column)
        size = (size + slice_size - 1) & -slice_size
        pieces.append('} ucd_record;\n*/\n\n')
        return size, ''.join(pieces)
 def test_record_size():
        """Self-check get_record_size_struct() against known record sizes.

        Each case pairs a list of sample records with the structure size in
        bytes that is expected for them; a mismatch trips an assertion.
        """
        cases = [
          ( [(3,), (6,), (6,), (1,)], 1 ),
          ( [(300,), (600,), (600,), (100,)], 2 ),
          ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ),
          ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ),
          ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ),
          ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ),
          ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ),
          ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ),
        ]
        for sample_records, expected_size in cases:
            size, struct = get_record_size_struct(sample_records)
            assert size == expected_size
 # Write the ucd_records C array to stdout. 'records' is the dictionary that
 # maps each record tuple to its record number; 'record_size' is the size of
 # one record in bytes and is used only in the generated comment.
 def print_records(records, record_size):
        # '%' binds more tightly than '+', so the formatting applies to the
        # second string literal only, before the two are joined.
        print 'const ucd_record PRIV(ucd_records)[] = { ' + \
              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
        # Turn the dictionary into (record, number) pairs and order them by
        # number. The positional arguments of the Python 2 list.sort() are
        # (cmp, key), so this sorts with the default comparison on x[1].
        records = zip(records.keys(), records.values())
        records.sort(None, lambda x: x[1])
        for i, record in enumerate(records):
                # One row per record: its fields, then its position as a comment.
                print ('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
        print '};\n'
 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
 # New for Unicode 5.0
 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
 # New for Unicode 5.1
 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
 # New for Unicode 5.2
 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
 # New for Unicode 6.0.0
 'Batak', 'Brahmi', 'Mandaic', \
 # New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
 ]
 category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
 break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]
 test_record_size()
 script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
 category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
 break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
 other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
 # This block of code was added by PH in September 2012. I am not a Python 
 # programmer, so the style is probably dreadful, but it does the job. It scans 
 # the other_case table to find sets of more than two characters that must all 
 # match each other caselessly. Later in this script a table of these sets is 
 # written out. However, we have to do this work here in order to compute the 
 # offsets in the table that are inserted into the main table.
 # The CaseFolding.txt file lists pairs, but the common logic for reading data
 # sets only one value, so first we go through the table and set "return" 
 # offsets for those that are not already set.
 # Scan every code point (MAX_UNICODE entries, so U+10FFFF is included).
 for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]
 # Now scan again and create equivalence sets.
 sets = []
 for c in range(MAX_UNICODE):
  o = c + other_case[c]
  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.
  if other_case[o] != -other_case[c]:
    t = o + other_case[o]
    # Scan the existing sets to see if any of the three characters are already
    # part of a set. If so, unite the existing set with the new set.
    appended = 0
    for s in sets:
      found = 0
      for x in s:
        if x == c or x == o or x == t:
          found = 1
      # Add new characters to an existing set. Membership is tested for each
      # of the three characters separately, so that finding one of them does
      # not stop a later, missing one from being added.
      if found:
        for y in [c, o, t]:
          if y not in s:
            s.append(y)
        appended = 1
    # If we have not added to an existing set, create a new one.
    if not appended:
      sets.append([c, o, t])
 # End of loop looking for caseless sets.
 # Now scan the sets and set appropriate offsets for the characters. Offsets
 # start at 1 and each set takes one slot more than its length.
 caseless_offsets = [0] * MAX_UNICODE
 offset = 1
 for s in sets:
  for x in s:
    caseless_offsets[x] = offset
  offset += len(s) + 1
 # End of block of code for creating offsets for caseless matching sets.
 # Merge the per-property tables into one table of record indexes plus the
 # dictionary of distinct records, then work out how each record is laid out.
 table, records = combine_tables(script, category, break_props, caseless_offsets, other_case)
 record_size, record_struct = get_record_size_struct(records.keys())
 # Try every power-of-two block size from 32 to 512 and remember the two-stage
 # split that gives the smallest total (records plus both stages). On a tie the
 # smaller block size wins because only a strictly smaller total replaces it.
 min_size = sys.maxint
 for shift in range(5, 10):
  block_size = 2 ** shift
  stage1, stage2 = compress_table(table, block_size)
  size = len(records) * record_size + get_tables_size(stage1, stage2)
  # Debugging aid:
  #print "/* block size %5d  => %5d bytes */" % (block_size, size)
  if size >= min_size:
    continue
  min_size = size
  min_block_size = block_size
  min_stage1, min_stage2 = stage1, stage2
 print "/* This module is generated by the maint/MultiStage2.py script."
 print "Do not modify it by hand. Instead modify the script and run it"
 print "to regenerate this code."
 print
 print "As well as being part of the PCRE2 library, this module is #included"
 print "by the pcre2test program, which redefines the PRIV macro to change"
 print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
 print "with the library. At present, just one of these tables is actually"
 print "needed. */"
 print
 print "#ifndef PCRE2_INCLUDED"
 print
 print "#ifdef HAVE_CONFIG_H"
 print "#include \"config.h\""
 print "#endif"
 print
 print "#include \"pcre2_internal.h\""
 print
 print "#endif /* PCRE2_INCLUDED */"
 print
 print "/* Unicode character database. */"
 print "/* This file was autogenerated by the MultiStage2.py script. */"
 print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
 print
 print "/* The tables herein are needed only when UCP support is built,"
 print "and in PCRE2 that happens automatically with UTF support." 
 print "This module should not be referenced otherwise, so"
 print "it should not matter whether it is compiled or not. However"
 print "a comment was received about space saving - maybe the guy linked"
 print "all the modules rather than using a library - so we include a"
 print "condition to cut out the tables when not needed. But don't leave"
 print "a totally empty module because some compilers barf at that."
 print "Instead, just supply small dummy tables. */"
 print
 print "#ifndef SUPPORT_UTF"
 print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
 print "const uint8_t PRIV(ucd_stage1)[] = {0};"
 print "const uint16_t PRIV(ucd_stage2)[] = {0};"
 print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
 print "#else"
 print
 print record_struct
 # --- Added by PH: output the table of caseless character sets ---
 print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
 print "  NOTACHAR,"
 for s in sets:
  s = sorted(s)
  for x in s:
    print '  0x%04x,' % x,
  print '  NOTACHAR,'   
 print '};'
 print
 # ------
 print "/* When #included in pcre2test, we don't need this large table. */"
 print
 print "#ifndef PCRE2_INCLUDED"
 print
 print_records(records, record_size)
 print_table(min_stage1, 'PRIV(ucd_stage1)')
 print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
 print "#if UCD_BLOCK_SIZE != %d" % min_block_size
 print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
 print "#endif"
 print "#endif  /* SUPPORT_UTF */"
 print
 print "#endif  /* PCRE2_INCLUDED */"
 """
 # Three-stage tables:
 # Find the optimum block size for 3-stage table
 min_size = sys.maxint
 for stage3_block in [2 ** i for i in range(2,6)]:
        stage_i, stage3 = compress_table(table, stage3_block)
        for stage2_block in [2 ** i for i in range(5,10)]:
                size = len(records) * 4
                stage1, stage2 = compress_table(stage_i, stage2_block)
                size += get_tables_size(stage1, stage2, stage3)
                # print "/* %5d / %3d  => %5d bytes */" % (stage2_block, stage3_block, size)
                if size < min_size:
                        min_size = size
                        min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
                        min_stage2_block, min_stage3_block = stage2_block, stage3_block
 print "/* Total size: %d bytes" % min_size */
 print_records(records)
 print_table(min_stage1, 'ucd_stage1')
 print_table(min_stage2, 'ucd_stage2', min_stage2_block)
 print_table(min_stage3, 'ucd_stage3', min_stage3_block)
 """
--- a/maint/README
+++ b/maint/README
@ -0,0 +1,324 @@
 MAINTENANCE README FOR PCRE2
 ============================
 The files in the "maint" directory of the PCRE2 source contain data, scripts,
 and programs that are used for the maintenance of PCRE2, but which do not form
 part of the PCRE2 distribution tarballs. This document describes these files
 and also contains some notes for maintainers. Its contents are:
  Files in the maint directory
  Updating to a new Unicode release
  Preparing for a PCRE2 release
  Making a PCRE2 release
  Long-term ideas (wish list)
 Files in the maint directory
 ============================
 GenerateUtt.py   A Python script to generate part of the pcre2_tables.c file
                 that contains Unicode script names in a long string with
                 offsets, which is tedious to maintain by hand.
 ManyConfigTests  A shell script that runs "configure, make, test" a number of
                 times with different configuration settings.
 MultiStage2.py   A Python script that generates the file pcre2_ucd.c from three
                 Unicode data tables, which are themselves downloaded from the
                 Unicode web site. Run this script in the "maint" directory.
                 The generated file contains the tables for a 2-stage lookup
                 of Unicode properties.
 pcre2_chartables.c.non-standard
                 This is a set of character tables that came from a Windows
                 system. It has characters greater than 128 that are set as
                 spaces, amongst other things. I kept it so that it can be
                 used for testing from time to time.
 README           This file.
 Unicode.tables   The files in this directory (CaseFolding.txt, 
                 DerivedGeneralCategory.txt, GraphemeBreakProperty.txt,
                 Scripts.txt and UnicodeData.txt) were downloaded from the
                 Unicode web site. They contain information about Unicode
                 characters and scripts. 
 ucptest.c        A short C program for testing the Unicode property macros
                 that do lookups in the pcre2_ucd.c data, mainly useful after
                 rebuilding the Unicode property table. Compile and run this in
                 the "maint" directory (see comments at its head).
 ucptestdata      A directory containing two files, testinput1 and testoutput1,
                 to use in conjunction with the ucptest program.
 utf8.c           A short, freestanding C program for converting a Unicode code
                 point into a sequence of bytes in the UTF-8 encoding, and vice
                 versa. If its argument is a hex number such as 0x1234, it
                 outputs a list of the equivalent UTF-8 bytes. If its argument
                 is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
                 treats them as a UTF-8 character and outputs the equivalent
                 code point in hex.
 Updating to a new Unicode release
 =================================
 When there is a new release of Unicode, the files in Unicode.tables must be
 refreshed from the web site. If the new version of Unicode adds new character
 scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
 GenerateUtt.py scripts must be edited to add the new names. Then MultiStage2.py
 can be run to generate a new version of pcre2_ucd.c, and GenerateUtt.py can be
 run to generate the tricky tables for inclusion in pcre2_tables.c.
 If MultiStage2.py gives the error "ValueError: list.index(x): x not in list",
 the cause is usually a missing (or misspelt) name in the list of scripts. I
 couldn't find a straightforward list of scripts on the Unicode site, but
 there's a useful Wikipedia page that lists them, and notes the Unicode version
 in which they were introduced:
 http://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
 The ucptest program can be compiled and used to check that the new tables in
 pcre2_ucd.c work properly, using the data files in ucptestdata to check a
 number of test characters. The source file ucptest.c must be updated whenever
 new Unicode script names are added.
 Note also that both the pcre2syntax.3 and pcre2pattern.3 man pages contain
 lists of Unicode script names.
 Preparing for a PCRE2 release
 =============================
 This section contains a checklist of things that I consult before building a
 distribution for a new release.
 . Ensure that the version number and version date are correct in configure.ac.
 . Update the library version numbers in configure.ac according to the rules 
  given below.
 . If new build options have been added, ensure that they are added to the CMake
  files as well as to the autoconf files. The relevant files are CMakeLists.txt
  and config-cmake.h.in. After making a release tarball, test it out with CMake 
  if there have been changes here.  
 . Run ./autogen.sh to ensure everything is up-to-date.
 . Compile and test with many different config options, and combinations of
  options. Also, test with valgrind by running "RunTest valgrind" and
  "RunGrepTest valgrind" (which takes quite a long time). The script
  maint/ManyConfigTests now encapsulates this testing. It runs tests with
  different configurations, and it also runs some of them with valgrind, all of
  which can take quite some time.
 . Run perltest.pl on the test data for tests 1, 4, and 6. The output 
  should match the PCRE2 test output, apart from the version identification at
  the start of each test. The other tests are not Perl-compatible (they use
  various PCRE2-specific features or options).
 . It is possible to test with the emulated memmove() function by undefining
  HAVE_MEMMOVE and HAVE_BCOPY in config.h, though I do not do this often. You
  may see a number of "pcre2_memmove defined but not used" warnings for the
  modules in which there is no call to memmove(). These can be ignored.
 . Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE, 
  NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these
  won't need changing, but over the long term things do change.
 . I used to test new releases myself on a number of different operating
  systems, using different compilers as well. For example, on Solaris it is
  helpful to test using Sun's cc compiler as a change from gcc. Adding
  -xarch=v9 to the cc options does a 64-bit test, but it also needs -S 64 for
  pcretest to increase the stack size for test 2. Since I retired I can no
  longer do this, but instead I rely on putting out release candidates for
  folks on the pcre-dev list to test.
 Updating version info for libtool
 =================================
 This set of rules for updating library version information came from a web page 
 whose URL I have forgotten. The version information consists of three parts:
 (current, revision, age).
 1. Start with version information of 0:0:0 for each libtool library.
 2. Update the version information only immediately before a public release of
   your software. More frequent updates are unnecessary, and only guarantee
   that the current interface number gets larger faster.
 3. If the library source code has changed at all since the last update, then
   increment revision; c:r:a becomes c:r+1:a.
 4. If any interfaces have been added, removed, or changed since the last
   update, increment current, and set revision to 0.
 5. If any interfaces have been added since the last public release, then
   increment age.
 6. If any interfaces have been removed or changed since the last public
   release, then set age to 0.
 The following explanation may help in understanding the above rules a bit
 better. Consider that there are three possible kinds of reaction from users to
 changes in a shared library:
 1. Programs using the previous version may use the new version as a drop-in
   replacement, and programs using the new version can also work with the
   previous one. In other words, no recompiling nor relinking is needed. In
   this case, increment revision only, don't touch current or age.
 2. Programs using the previous version may use the new version as a drop-in
   replacement, but programs using the new version may use APIs not present in
   the previous one. In other words, a program linking against the new version
   may fail if linked against the old version at run time. In this case, set
   revision to 0, increment current and age.
 3. Programs may need to be changed, recompiled, relinked in order to use the
   new version. Increment current, set revision and age to 0.
 Making a PCRE2 release
 ======================
 Run PrepareRelease and commit the files that it changes (by removing trailing
 spaces). The first thing this script does is to run CheckMan on the man pages;
 if it finds any markup errors, it reports them and then aborts.
 Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
 and the zipball. Double-check with "svn status", then create an SVN tagged
 copy:
  svn copy svn://vcs.exim.org/pcre2/code/trunk \
           svn://vcs.exim.org/pcre2/code/tags/pcre-8.xx
 Don't forget to update Freecode (fka Freshmeat) when the new release is out,
 and to tell webmaster@pcre.org and the mailing list. Also, update the list of
 version numbers in Bugzilla (edit products).
 Long-term ideas (wish list)
 ===========================
 This section records a list of ideas so that they do not get forgotten. They
 vary enormously in their usefulness and potential for implementation. Some are
 very sensible; some are rather wacky. Some have been on this list for years;
 others are relatively new.
 . Optimization
  There are always ideas for new optimizations so as to speed up pattern
  matching. Most of them try to save work by recognizing a non-match without
  having to scan all the possibilities. These are some that I've recorded:
  * /((A{0,5}){0,5}){0,5}(something complex)/ on a non-matching string is very
    slow, though Perl is fast. Can we speed up somehow? Convert to {0,125}?
    OTOH, this is pathological - the user could easily fix it.
  * Turn ={4} into ==== ? (for speed). I once did an experiment, and it seems
    to have little effect, and maybe makes things worse.
  * "Ends with literal string" - note that a single character doesn't gain much
    over the existing "required byte" (reqbyte) feature that just remembers one
    data unit.
  * Remember an initial string rather than just 1 code unit?
  * A required code unit from alternatives - not just the last unit, but an
    earlier one if common to all alternatives.
  o Friedl contains other ideas.
  * The code does not set initial code unit flags for Unicode property types
    such as \p; I don't know how much benefit there would be for, for example,
    setting the bits for 0-9 and all values >= xC0 (in 8-bit mode) when a
    pattern starts with \p{N}.
  * There is scope for more "auto-possessifying" in connection with \p and \P.
 . If Perl gets to a consistent state over the settings of capturing sub-
  patterns inside repeats, see if we can match it. One example of the
  difference is the matching of /(main(O)?)+/ against mainOmain, where PCRE
  leaves $2 set. In Perl, it's unset. Changing this in PCRE will be very hard
  because I think it needs much more state to be remembered.
 . Perl 6 will be a revolution. Is it a revolution too far for PCRE?
 . Allow errorptr and erroroffset to be NULL. I don't like this idea.
 . Line endings:
  * Option to use NUL as a line terminator in subject strings. This could now
    be done relatively easily since the extension to support LF, CR, and CRLF.
    If it is done, a suitable option for pcregrep is also required.
 . Catch SIGSEGV for stack overflows?
 . A feature to suspend a match via a callout was once requested.
 . Option to convert results into character offsets and character lengths.
 . Option for pcregrep to scan only the start of a file. I am not keen - this is
  the job of "head".
 . A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
  preceded by a blank line, instead of adding it to every matched line, and (b)
  support --outputfile=name.
 . Consider making UTF and UCP the default for PCRE n.0 for some n > 8.
 . Define a union for the results from pcre2_pattern_info().
 . Provide a "random access to the subject" facility so that the way in which it
  is stored is independent of PCRE. For efficiency, it probably isn't possible
  to switch this dynamically. It would have to be specified when PCRE was
  compiled. PCRE would then call a function every time it wanted a character.
 . Wild thought: the ability to compile from PCRE's internal byte code to a real
  FSM and a very fast (third) matcher to process the result. There would be
  even more restrictions than for pcre_dfa_exec(), however. This is not easy.
  This is probably obsolete now that we have the JIT support.
 . Should pcretest have some private locale data, to avoid relying on the
  available locales for the test data, since different OS have different ideas?
  This won't be as thorough a test, but perhaps that doesn't really matter.
 . pcregrep: add -rs for a sorted recurse? Having to store file names and sort
  them will of course slow it down.
 . Someone suggested --disable-callout to save code space when callouts are
  never wanted. This seems rather marginal.
 . A user suggested a parameter to limit the length of string matched, for
  example if the parameter is N, the current match should fail if the matched
  substring exceeds N. This could apply to both match functions. The value
  could be a new field in the extra block.
 . Callouts with arguments: (?Cn:ARG) for instance.
 . Write a function that generates random matching strings for a compiled regex.
 . Pcregrep: an option to specify the output line separator, either as a string
  or select from a fixed list. This is not dead easy, because at the moment it
  outputs whatever is in the input file.
 . Improve the code for duplicate checking in pcre_dfa_exec(). An incomplete,
  non-thread-safe patch showed that this can help performance for patterns
  where there are many alternatives. However, a simple thread-safe
  implementation that I tried made things worse in many simple cases, so this
  is not an obviously good thing.
 . PCRE cannot at present distinguish between subpatterns with different names,
  but the same number (created by the use of ?|). In order to do so, a way of
  remembering *which* subpattern numbered n matched is needed. Bugzilla #760.
  Now that (*MARK) has been implemented, it can perhaps be used as a way round
  this problem.
 . Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
  "something" and then the #ifdef appears only in one place, in "something".
 Philip Hazel
 Email local part: ph10
 Email domain: cam.ac.uk
 Last updated: 13 May 2014
--- a/maint/Unicode.tables/CaseFolding.txt
+++ b/maint/Unicode.tables/CaseFolding.txt
--- a/maint/Unicode.tables/DerivedGeneralCategory.txt
+++ b/maint/Unicode.tables/DerivedGeneralCategory.txt
--- a/maint/Unicode.tables/GraphemeBreakProperty.txt
+++ b/maint/Unicode.tables/GraphemeBreakProperty.txt
--- a/maint/Unicode.tables/Scripts.txt
+++ b/maint/Unicode.tables/Scripts.txt
--- a/maint/Unicode.tables/UnicodeData.txt
+++ b/maint/Unicode.tables/UnicodeData.txt
--- a/maint/pcre2_chartables.c.non-standard
+++ b/maint/pcre2_chartables.c.non-standard
@ -0,0 +1,138 @@
 /* A non-standard set of character tables, kept for occasional testing. The
 array holds 1088 bytes in four consecutive sections, marked below. The meaning
 of the last two sections is not visible from the data alone; the notes on them
 are assumptions to be confirmed against the library's table layout. */
 const unsigned char _pcre_default_tables[] = {
 /* Bytes 0-255: lower-casing table. 65-90 map to 97-122; 192-214 and 216-222
 map to 224-246 and 248-254; every other value maps to itself. */
 0,1,2,3,4,5,6,7,
 8,9,10,11,12,13,14,15,
 16,17,18,19,20,21,22,23,
 24,25,26,27,28,29,30,31,
 32,33,34,35,36,37,38,39,
 40,41,42,43,44,45,46,47,
 48,49,50,51,52,53,54,55,
 56,57,58,59,60,61,62,63,
 64,97,98,99,100,101,102,103,
 104,105,106,107,108,109,110,111,
 112,113,114,115,116,117,118,119,
 120,121,122,91,92,93,94,95,
 96,97,98,99,100,101,102,103,
 104,105,106,107,108,109,110,111,
 112,113,114,115,116,117,118,119,
 120,121,122,123,124,125,126,127,
 128,129,130,131,132,133,134,135,
 136,137,138,139,140,141,142,143,
 144,145,146,147,148,149,150,151,
 152,153,154,155,156,157,158,159,
 160,161,162,163,164,165,166,167,
 168,169,170,171,172,173,174,175,
 176,177,178,179,180,181,182,183,
 184,185,186,187,188,189,190,191,
 224,225,226,227,228,229,230,231,
 232,233,234,235,236,237,238,239,
 240,241,242,243,244,245,246,215,
 248,249,250,251,252,253,254,223,
 224,225,226,227,228,229,230,231,
 232,233,234,235,236,237,238,239,
 240,241,242,243,244,245,246,247,
 248,249,250,251,252,253,254,255,
 /* Bytes 256-511: case-flipping table. As above for upper to lower, and in
 addition 97-122 map to 65-90, 224-246 to 192-214, and 248-254 to 216-222. */
 0,1,2,3,4,5,6,7,
 8,9,10,11,12,13,14,15,
 16,17,18,19,20,21,22,23,
 24,25,26,27,28,29,30,31,
 32,33,34,35,36,37,38,39,
 40,41,42,43,44,45,46,47,
 48,49,50,51,52,53,54,55,
 56,57,58,59,60,61,62,63,
 64,97,98,99,100,101,102,103,
 104,105,106,107,108,109,110,111,
 112,113,114,115,116,117,118,119,
 120,121,122,91,92,93,94,95,
 96,65,66,67,68,69,70,71,
 72,73,74,75,76,77,78,79,
 80,81,82,83,84,85,86,87,
 88,89,90,123,124,125,126,127,
 128,129,130,131,132,133,134,135,
 136,137,138,139,140,141,142,143,
 144,145,146,147,148,149,150,151,
 152,153,154,155,156,157,158,159,
 160,161,162,163,164,165,166,167,
 168,169,170,171,172,173,174,175,
 176,177,178,179,180,181,182,183,
 184,185,186,187,188,189,190,191,
 224,225,226,227,228,229,230,231,
 232,233,234,235,236,237,238,239,
 240,241,242,243,244,245,246,215,
 248,249,250,251,252,253,254,223,
 192,193,194,195,196,197,198,199,
 200,201,202,203,204,205,206,207,
 208,209,210,211,212,213,214,247,
 216,217,218,219,220,221,222,255,
 /* Bytes 512-831 (320 bytes): presumably the character class bitmaps, ten
 maps of 32 bytes each -- TODO confirm against the library headers. */
 0,62,0,0,1,0,0,0,
 0,0,0,0,0,0,0,0,
 32,0,0,0,1,0,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,255,3,
 126,0,0,0,126,0,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,255,3,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,12,2,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,
 254,255,255,7,0,0,0,0,
 0,0,0,0,0,0,0,0,
 255,255,127,127,0,0,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,254,255,255,7,
 0,0,0,0,0,4,32,4,
 0,0,0,128,255,255,127,255,
 0,0,0,0,0,0,255,3,
 254,255,255,135,254,255,255,7,
 0,0,0,0,0,4,44,6,
 255,255,127,255,255,255,127,255,
 0,0,0,0,254,255,255,255,
 255,255,255,255,255,255,255,127,
 0,0,0,0,254,255,255,255,
 255,255,255,255,255,255,255,255,
 0,2,0,0,255,255,255,255,
 255,255,255,255,255,255,255,127,
 0,0,0,0,255,255,255,255,
 255,255,255,255,255,255,255,255,
 0,0,0,0,254,255,0,252,
 1,0,0,248,1,0,0,120,
 0,0,0,0,254,255,255,255,
 0,0,128,0,0,0,128,0,
 255,255,255,255,0,0,0,0,
 0,0,0,0,0,0,0,128,
 255,255,255,255,0,0,0,0,
 0,0,0,0,0,0,0,0,
 /* Bytes 832-1087 (256 bytes): presumably one byte of character type flags
 per character value -- TODO confirm against the library headers. */
 128,0,0,0,0,0,0,0,
 0,1,1,0,1,1,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,
 1,0,0,0,128,0,0,0,
 128,128,128,128,0,0,128,0,
 28,28,28,28,28,28,28,28,
 28,28,0,0,0,0,0,128,
 0,26,26,26,26,26,26,18,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,18,
 18,18,18,128,128,0,128,16,
 0,26,26,26,26,26,26,18,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,18,
 18,18,18,128,128,0,0,0,
 0,0,0,0,0,1,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,
 1,0,0,0,0,0,0,0,
 0,0,18,0,0,0,0,0,
 0,0,20,20,0,18,0,0,
 0,20,18,0,0,0,0,0,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,0,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,18,
 18,18,18,18,18,18,18,0,
 18,18,18,18,18,18,18,18
 };
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -0,0 +1,297 @@
 /***************************************************
 * A program for testing the Unicode property table *
 ***************************************************/
 /* Copyright (c) University of Cambridge 2008 */
 /* Compile thus:
   gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
     ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
 */
 /* The program expects to read commands on stdin, and it writes output
 to stdout. There is only one command, "findprop", followed by a list of Unicode 
 code points as hex numbers (without any prefixes). The output is one line per 
 character, giving its Unicode properties followed by its other case if there is 
 one. */
 #ifdef HAVE_CONFIG_H
 #include "../src/config.h"
 #endif
 #ifndef SUPPORT_UTF
 #define SUPPORT_UTF
 #endif
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "../src/pcre2_internal.h"
 #include "../src/pcre2_ucp.h"
 /* -------------------------------------------------------------------*/
 #define CS   (char *)
 #define CCS  (const char *)
 #define CSS  (char **)
 #define US   (unsigned char *)
 #define CUS  (const unsigned char *)
 #define USS  (unsigned char **)
 /* -------------------------------------------------------------------*/
 /*************************************************
 *      Print Unicode property info for a char    *
 *************************************************/
 /* Look up the Unicode properties of one code point and print them on a single
 line: the code point in hex, its general category, its full character type,
 its script, and its grapheme break property. If the other case differs from
 the code point itself, the other case is appended, followed by any further
 members of the character's caseless set.

 Arguments:
   c           the code point to describe; it is handed to the UCD_* lookup
               macros without any range check, so the caller must supply a
               value that those macros accept

 Returns:      nothing
 */
 static void
 print_prop(int c)
 {
 int type = UCD_CATEGORY(c);
 int fulltype = UCD_CHARTYPE(c);
 int script = UCD_SCRIPT(c);
 int gbprop = UCD_GRAPHBREAK(c);
 int othercase = UCD_OTHERCASE(c);
 int caseset = UCD_CASESET(c);
 /* Each name stays as "??" when the corresponding value is not recognized by
 the switches below, which makes a missing case easy to spot in the output. */
 unsigned char *fulltypename = US"??";
 unsigned char *typename = US"??";
 unsigned char *scriptname = US"??";
 unsigned char *graphbreak = US"??";
 switch (type)
  {
  case ucp_C: typename = US"Control"; break;
  case ucp_L: typename = US"Letter"; break;
  case ucp_M: typename = US"Mark"; break;
  case ucp_N: typename = US"Number"; break;
  case ucp_P: typename = US"Punctuation"; break;
  case ucp_S: typename = US"Symbol"; break;
  case ucp_Z: typename = US"Separator"; break;
  }
 switch (fulltype)
  {
  case ucp_Cc: fulltypename = US"Control"; break;
  case ucp_Cf: fulltypename = US"Format"; break;
  case ucp_Cn: fulltypename = US"Unassigned"; break;
  case ucp_Co: fulltypename = US"Private use"; break;
  case ucp_Cs: fulltypename = US"Surrogate"; break;
  case ucp_Ll: fulltypename = US"Lower case letter"; break;
  case ucp_Lm: fulltypename = US"Modifier letter"; break;
  case ucp_Lo: fulltypename = US"Other letter"; break;
  case ucp_Lt: fulltypename = US"Title case letter"; break;
  case ucp_Lu: fulltypename = US"Upper case letter"; break;
  case ucp_Mc: fulltypename = US"Spacing mark"; break;
  case ucp_Me: fulltypename = US"Enclosing mark"; break;
  case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
  case ucp_Nd: fulltypename = US"Decimal number"; break;
  case ucp_Nl: fulltypename = US"Letter number"; break;
  case ucp_No: fulltypename = US"Other number"; break;
  case ucp_Pc: fulltypename = US"Connector punctuation"; break;
  case ucp_Pd: fulltypename = US"Dash punctuation"; break;
  case ucp_Pe: fulltypename = US"Close punctuation"; break;
  case ucp_Pf: fulltypename = US"Final punctuation"; break;
  case ucp_Pi: fulltypename = US"Initial punctuation"; break;
  case ucp_Po: fulltypename = US"Other punctuation"; break;
  case ucp_Ps: fulltypename = US"Open punctuation"; break;
  case ucp_Sc: fulltypename = US"Currency symbol"; break;
  case ucp_Sk: fulltypename = US"Modifier symbol"; break;
  case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
  case ucp_So: fulltypename = US"Other symbol"; break;
  case ucp_Zl: fulltypename = US"Line separator"; break;
  case ucp_Zp: fulltypename = US"Paragraph separator"; break;
  case ucp_Zs: fulltypename = US"Space separator"; break;
  }
 switch(gbprop)
  {
  case ucp_gbCR:           graphbreak = US"CR"; break;
  case ucp_gbLF:           graphbreak = US"LF"; break;
  case ucp_gbControl:      graphbreak = US"Control"; break;
  case ucp_gbExtend:       graphbreak = US"Extend"; break;
  case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
  case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
  case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
  case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
  case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
  case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
  case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
  case ucp_gbOther:        graphbreak = US"Other"; break;
  }
 switch(script)
  {
  case ucp_Arabic:      scriptname = US"Arabic"; break;
  case ucp_Armenian:    scriptname = US"Armenian"; break;
  case ucp_Balinese:    scriptname = US"Balinese"; break;
  case ucp_Bengali:     scriptname = US"Bengali"; break;
  case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
  case ucp_Braille:     scriptname = US"Braille"; break;
  case ucp_Buginese:    scriptname = US"Buginese"; break;
  case ucp_Buhid:       scriptname = US"Buhid"; break;
  case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
  case ucp_Cherokee:    scriptname = US"Cherokee"; break;
  case ucp_Common:      scriptname = US"Common"; break;
  case ucp_Coptic:      scriptname = US"Coptic"; break;
  case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
  case ucp_Cypriot:     scriptname = US"Cypriot"; break;
  case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
  case ucp_Deseret:     scriptname = US"Deseret"; break;
  case ucp_Devanagari:  scriptname = US"Devanagari"; break;
  case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
  case ucp_Georgian:    scriptname = US"Georgian"; break;
  case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
  case ucp_Gothic:      scriptname = US"Gothic"; break;
  case ucp_Greek:       scriptname = US"Greek"; break;
  case ucp_Gujarati:    scriptname = US"Gujarati"; break;
  case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
  case ucp_Han:         scriptname = US"Han"; break;
  case ucp_Hangul:      scriptname = US"Hangul"; break;
  case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
  case ucp_Hebrew:      scriptname = US"Hebrew"; break;
  case ucp_Hiragana:    scriptname = US"Hiragana"; break;
  case ucp_Inherited:   scriptname = US"Inherited"; break;
  case ucp_Kannada:     scriptname = US"Kannada"; break;
  case ucp_Katakana:    scriptname = US"Katakana"; break;
  case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
  case ucp_Khmer:       scriptname = US"Khmer"; break;
  case ucp_Lao:         scriptname = US"Lao"; break;
  case ucp_Latin:       scriptname = US"Latin"; break;
  case ucp_Limbu:       scriptname = US"Limbu"; break;
  case ucp_Linear_B:    scriptname = US"Linear_B"; break;
  case ucp_Malayalam:   scriptname = US"Malayalam"; break;
  case ucp_Mongolian:   scriptname = US"Mongolian"; break;
  case ucp_Myanmar:     scriptname = US"Myanmar"; break;
  case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
  case ucp_Nko:         scriptname = US"Nko"; break;
  case ucp_Ogham:       scriptname = US"Ogham"; break;
  case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
  case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
  case ucp_Oriya:       scriptname = US"Oriya"; break;
  case ucp_Osmanya:     scriptname = US"Osmanya"; break;
  case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
  case ucp_Phoenician:  scriptname = US"Phoenician"; break;
  case ucp_Runic:       scriptname = US"Runic"; break;
  case ucp_Shavian:     scriptname = US"Shavian"; break;
  case ucp_Sinhala:     scriptname = US"Sinhala"; break;
  case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
  case ucp_Syriac:      scriptname = US"Syriac"; break;
  case ucp_Tagalog:     scriptname = US"Tagalog"; break;
  case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
  case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
  case ucp_Tamil:       scriptname = US"Tamil"; break;
  case ucp_Telugu:      scriptname = US"Telugu"; break;
  case ucp_Thaana:      scriptname = US"Thaana"; break;
  case ucp_Thai:        scriptname = US"Thai"; break;
  case ucp_Tibetan:     scriptname = US"Tibetan"; break;
  case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
  case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
  case ucp_Yi:          scriptname = US"Yi"; break;
  /* New for Unicode 5.1: */
  case ucp_Carian:      scriptname = US"Carian"; break;
  case ucp_Cham:        scriptname = US"Cham"; break;
  case ucp_Kayah_Li:    scriptname = US"Kayah_Li"; break;
  case ucp_Lepcha:      scriptname = US"Lepcha"; break;
  case ucp_Lycian:      scriptname = US"Lycian"; break;
  case ucp_Lydian:      scriptname = US"Lydian"; break;
  case ucp_Ol_Chiki:    scriptname = US"Ol_Chiki"; break;
  case ucp_Rejang:      scriptname = US"Rejang"; break;
  case ucp_Saurashtra:  scriptname = US"Saurashtra"; break;
  case ucp_Sundanese:   scriptname = US"Sundanese"; break;
  case ucp_Vai:         scriptname = US"Vai"; break;
  /* New for Unicode 5.2: */
  case ucp_Avestan:     scriptname = US"Avestan"; break;
  case ucp_Bamum:       scriptname = US"Bamum"; break;
  case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
  case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
  case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
  case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
  case ucp_Javanese:    scriptname = US"Javanese"; break;
  case ucp_Kaithi:      scriptname = US"Kaithi"; break;
  case ucp_Lisu:        scriptname = US"Lisu"; break;
  case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
  case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
  case ucp_Old_Turkic:  scriptname = US"Old_Turkic"; break;
  case ucp_Samaritan:   scriptname = US"Samaritan"; break;
  case ucp_Tai_Tham:    scriptname = US"Tai_Tham"; break;
  case ucp_Tai_Viet:    scriptname = US"Tai_Viet"; break;
  /* New for Unicode 6.0.0 */
  case ucp_Batak:       scriptname = US"Batak"; break;
  case ucp_Brahmi:      scriptname = US"Brahmi"; break;
  case ucp_Mandaic:     scriptname = US"Mandaic"; break;
  /* New for Unicode 6.1.0 */
  case ucp_Chakma:               scriptname = US"Chakma"; break;
  case ucp_Meroitic_Cursive:     scriptname = US"Meroitic_Cursive"; break;
  case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
  case ucp_Miao:                 scriptname = US"Miao"; break;
  case ucp_Sharada:              scriptname = US"Sharada"; break;
  /* This name was previously misspelt as "Sora Sompent"; it now follows the
  same underscore form as every other multi-word script name here. */
  case ucp_Sora_Sompeng:         scriptname = US"Sora_Sompeng"; break;
  case ucp_Takri:                scriptname = US"Takri"; break;
  }
 printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
 if (othercase != c)
  {
  printf(", %04x", othercase);
  if (caseset != 0)
    {
    /* caseset is the index of the first member of this character's set in
    the caseless sets table. Start one entry earlier because the loop
    pre-increments, and stop at the terminating entry (>= NOTACHAR). The
    other case and the character itself are skipped, since they have already
    been printed. */
    const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
    while (*(++p) < NOTACHAR)
      if (*p != othercase && *p != c) printf(", %04x", *p);
    }
  }
 printf("\n");
 }
 /*************************************************
 *               Main program                     *
 *************************************************/
 int
 main(void)
 {
 unsigned char buffer[1024];
 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
  {
  unsigned char name[24];
  unsigned char *s, *t;
  printf("%s", buffer);
  s = buffer;
  while (isspace(*s)) s++;
  if (*s == 0) continue;
  for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
  *t = 0;
  while (isspace(*s)) s++;
  if (strcmp(CS name, "findprop") == 0)
    {
    while (*s != 0)
      {
      unsigned char *endptr;
      int c = strtoul(CS s, CSS(&endptr), 16);
      print_prop(c);
      s = endptr;
      while (isspace(*s)) s++;
      }
    }
  else printf("Unknown test command %s\n", name);
  }
 return 0;
 }
 /* End */
--- a/maint/ucptestdata/testinput1
+++ b/maint/ucptestdata/testinput1
@ -0,0 +1,34 @@
 findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 
 findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 
 findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 
 findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 
 findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 
 findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 
 findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 
 findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 
 findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 
 findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f 
 findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af 
 findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf 
 findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf 
 findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df 
 findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef 
 findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff 
 findprop 0100 0101 0102 0103 0104 0105 0106
 findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7 
 findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
 findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
 findprop 10000 10001 e01ef f0000 100000
 findprop 1b00 12000 7c0 a840 10900
 findprop 1d79 a77d
 findprop  0800  083e  a4d0  a4f7  aa80  aadf
 findprop 10b00 10b35 13000 1342e 10840 10855
 findprop 11100 1113c 11680 116c0
 findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
--- a/maint/ucptestdata/testoutput1
+++ b/maint/ucptestdata/testoutput1
@ -0,0 +1,359 @@
 findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 
 0000 Control: Control, Common, Control
 0001 Control: Control, Common, Control
 0002 Control: Control, Common, Control
 0003 Control: Control, Common, Control
 0004 Control: Control, Common, Control
 0005 Control: Control, Common, Control
 0006 Control: Control, Common, Control
 0007 Control: Control, Common, Control
 0008 Control: Control, Common, Control
 0009 Control: Control, Common, Control
 000a Control: Control, Common, LF
 000b Control: Control, Common, Control
 000c Control: Control, Common, Control
 000d Control: Control, Common, CR
 000e Control: Control, Common, Control
 000f Control: Control, Common, Control
 findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 
 0010 Control: Control, Common, Control
 0011 Control: Control, Common, Control
 0012 Control: Control, Common, Control
 0013 Control: Control, Common, Control
 0014 Control: Control, Common, Control
 0015 Control: Control, Common, Control
 0016 Control: Control, Common, Control
 0017 Control: Control, Common, Control
 0018 Control: Control, Common, Control
 0019 Control: Control, Common, Control
 001a Control: Control, Common, Control
 001b Control: Control, Common, Control
 001c Control: Control, Common, Control
 001d Control: Control, Common, Control
 001e Control: Control, Common, Control
 001f Control: Control, Common, Control
 findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 
 0020 Separator: Space separator, Common, Other
 0021 Punctuation: Other punctuation, Common, Other
 0022 Punctuation: Other punctuation, Common, Other
 0023 Punctuation: Other punctuation, Common, Other
 0024 Symbol: Currency symbol, Common, Other
 0025 Punctuation: Other punctuation, Common, Other
 0026 Punctuation: Other punctuation, Common, Other
 0027 Punctuation: Other punctuation, Common, Other
 0028 Punctuation: Open punctuation, Common, Other
 0029 Punctuation: Close punctuation, Common, Other
 002a Punctuation: Other punctuation, Common, Other
 002b Symbol: Mathematical symbol, Common, Other
 002c Punctuation: Other punctuation, Common, Other
 002d Punctuation: Dash punctuation, Common, Other
 002e Punctuation: Other punctuation, Common, Other
 002f Punctuation: Other punctuation, Common, Other
 findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 
 0030 Number: Decimal number, Common, Other
 0031 Number: Decimal number, Common, Other
 0032 Number: Decimal number, Common, Other
 0033 Number: Decimal number, Common, Other
 0034 Number: Decimal number, Common, Other
 0035 Number: Decimal number, Common, Other
 0036 Number: Decimal number, Common, Other
 0037 Number: Decimal number, Common, Other
 0038 Number: Decimal number, Common, Other
 0039 Number: Decimal number, Common, Other
 003a Punctuation: Other punctuation, Common, Other
 003b Punctuation: Other punctuation, Common, Other
 003c Symbol: Mathematical symbol, Common, Other
 003d Symbol: Mathematical symbol, Common, Other
 003e Symbol: Mathematical symbol, Common, Other
 003f Punctuation: Other punctuation, Common, Other
 findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 
 0040 Punctuation: Other punctuation, Common, Other
 0041 Letter: Upper case letter, Latin, Other, 0061
 0042 Letter: Upper case letter, Latin, Other, 0062
 0043 Letter: Upper case letter, Latin, Other, 0063
 0044 Letter: Upper case letter, Latin, Other, 0064
 0045 Letter: Upper case letter, Latin, Other, 0065
 0046 Letter: Upper case letter, Latin, Other, 0066
 0047 Letter: Upper case letter, Latin, Other, 0067
 0048 Letter: Upper case letter, Latin, Other, 0068
 0049 Letter: Upper case letter, Latin, Other, 0069
 004a Letter: Upper case letter, Latin, Other, 006a
 004b Letter: Upper case letter, Latin, Other, 006b, 212a
 004c Letter: Upper case letter, Latin, Other, 006c
 004d Letter: Upper case letter, Latin, Other, 006d
 004e Letter: Upper case letter, Latin, Other, 006e
 004f Letter: Upper case letter, Latin, Other, 006f
 findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 
 0050 Letter: Upper case letter, Latin, Other, 0070
 0051 Letter: Upper case letter, Latin, Other, 0071
 0052 Letter: Upper case letter, Latin, Other, 0072
 0053 Letter: Upper case letter, Latin, Other, 0073, 017f
 0054 Letter: Upper case letter, Latin, Other, 0074
 0055 Letter: Upper case letter, Latin, Other, 0075
 0056 Letter: Upper case letter, Latin, Other, 0076
 0057 Letter: Upper case letter, Latin, Other, 0077
 0058 Letter: Upper case letter, Latin, Other, 0078
 0059 Letter: Upper case letter, Latin, Other, 0079
 005a Letter: Upper case letter, Latin, Other, 007a
 005b Punctuation: Open punctuation, Common, Other
 005c Punctuation: Other punctuation, Common, Other
 005d Punctuation: Close punctuation, Common, Other
 005e Symbol: Modifier symbol, Common, Other
 005f Punctuation: Connector punctuation, Common, Other
 findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 
 0060 Symbol: Modifier symbol, Common, Other
 0061 Letter: Lower case letter, Latin, Other, 0041
 0062 Letter: Lower case letter, Latin, Other, 0042
 0063 Letter: Lower case letter, Latin, Other, 0043
 0064 Letter: Lower case letter, Latin, Other, 0044
 0065 Letter: Lower case letter, Latin, Other, 0045
 0066 Letter: Lower case letter, Latin, Other, 0046
 0067 Letter: Lower case letter, Latin, Other, 0047
 0068 Letter: Lower case letter, Latin, Other, 0048
 0069 Letter: Lower case letter, Latin, Other, 0049
 006a Letter: Lower case letter, Latin, Other, 004a
 006b Letter: Lower case letter, Latin, Other, 004b, 212a
 006c Letter: Lower case letter, Latin, Other, 004c
 006d Letter: Lower case letter, Latin, Other, 004d
 006e Letter: Lower case letter, Latin, Other, 004e
 006f Letter: Lower case letter, Latin, Other, 004f
 findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 
 0070 Letter: Lower case letter, Latin, Other, 0050
 0071 Letter: Lower case letter, Latin, Other, 0051
 0072 Letter: Lower case letter, Latin, Other, 0052
 0073 Letter: Lower case letter, Latin, Other, 0053, 017f
 0074 Letter: Lower case letter, Latin, Other, 0054
 0075 Letter: Lower case letter, Latin, Other, 0055
 0076 Letter: Lower case letter, Latin, Other, 0056
 0077 Letter: Lower case letter, Latin, Other, 0057
 0078 Letter: Lower case letter, Latin, Other, 0058
 0079 Letter: Lower case letter, Latin, Other, 0059
 007a Letter: Lower case letter, Latin, Other, 005a
 007b Punctuation: Open punctuation, Common, Other
 007c Symbol: Mathematical symbol, Common, Other
 007d Punctuation: Close punctuation, Common, Other
 007e Symbol: Mathematical symbol, Common, Other
 007f Control: Control, Common, Control
 findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 
 0080 Control: Control, Common, Control
 0081 Control: Control, Common, Control
 0082 Control: Control, Common, Control
 0083 Control: Control, Common, Control
 0084 Control: Control, Common, Control
 0085 Control: Control, Common, Control
 0086 Control: Control, Common, Control
 0087 Control: Control, Common, Control
 0088 Control: Control, Common, Control
 0089 Control: Control, Common, Control
 008a Control: Control, Common, Control
 008b Control: Control, Common, Control
 008c Control: Control, Common, Control
 008d Control: Control, Common, Control
 008e Control: Control, Common, Control
 008f Control: Control, Common, Control
 findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f 
 0090 Control: Control, Common, Control
 0091 Control: Control, Common, Control
 0092 Control: Control, Common, Control
 0093 Control: Control, Common, Control
 0094 Control: Control, Common, Control
 0095 Control: Control, Common, Control
 0096 Control: Control, Common, Control
 0097 Control: Control, Common, Control
 0098 Control: Control, Common, Control
 0099 Control: Control, Common, Control
 009a Control: Control, Common, Control
 009b Control: Control, Common, Control
 009c Control: Control, Common, Control
 009d Control: Control, Common, Control
 009e Control: Control, Common, Control
 009f Control: Control, Common, Control
 findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af 
 00a0 Separator: Space separator, Common, Other
 00a1 Punctuation: Other punctuation, Common, Other
 00a2 Symbol: Currency symbol, Common, Other
 00a3 Symbol: Currency symbol, Common, Other
 00a4 Symbol: Currency symbol, Common, Other
 00a5 Symbol: Currency symbol, Common, Other
 00a6 Symbol: Other symbol, Common, Other
 00a7 Punctuation: Other punctuation, Common, Other
 00a8 Symbol: Modifier symbol, Common, Other
 00a9 Symbol: Other symbol, Common, Other
 00aa Letter: Other letter, Latin, Other
 00ab Punctuation: Initial punctuation, Common, Other
 00ac Symbol: Mathematical symbol, Common, Other
 00ad Control: Format, Common, Control
 00ae Symbol: Other symbol, Common, Other
 00af Symbol: Modifier symbol, Common, Other
 findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf 
 00b0 Symbol: Other symbol, Common, Other
 00b1 Symbol: Mathematical symbol, Common, Other
 00b2 Number: Other number, Common, Other
 00b3 Number: Other number, Common, Other
 00b4 Symbol: Modifier symbol, Common, Other
 00b5 Letter: Lower case letter, Common, Other, 03bc, 039c
 00b6 Punctuation: Other punctuation, Common, Other
 00b7 Punctuation: Other punctuation, Common, Other
 00b8 Symbol: Modifier symbol, Common, Other
 00b9 Number: Other number, Common, Other
 00ba Letter: Other letter, Latin, Other
 00bb Punctuation: Final punctuation, Common, Other
 00bc Number: Other number, Common, Other
 00bd Number: Other number, Common, Other
 00be Number: Other number, Common, Other
 00bf Punctuation: Other punctuation, Common, Other
 findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf 
 00c0 Letter: Upper case letter, Latin, Other, 00e0
 00c1 Letter: Upper case letter, Latin, Other, 00e1
 00c2 Letter: Upper case letter, Latin, Other, 00e2
 00c3 Letter: Upper case letter, Latin, Other, 00e3
 00c4 Letter: Upper case letter, Latin, Other, 00e4
 00c5 Letter: Upper case letter, Latin, Other, 00e5, 212b
 00c6 Letter: Upper case letter, Latin, Other, 00e6
 00c7 Letter: Upper case letter, Latin, Other, 00e7
 00c8 Letter: Upper case letter, Latin, Other, 00e8
 00c9 Letter: Upper case letter, Latin, Other, 00e9
 00ca Letter: Upper case letter, Latin, Other, 00ea
 00cb Letter: Upper case letter, Latin, Other, 00eb
 00cc Letter: Upper case letter, Latin, Other, 00ec
 00cd Letter: Upper case letter, Latin, Other, 00ed
 00ce Letter: Upper case letter, Latin, Other, 00ee
 00cf Letter: Upper case letter, Latin, Other, 00ef
 findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df 
 00d0 Letter: Upper case letter, Latin, Other, 00f0
 00d1 Letter: Upper case letter, Latin, Other, 00f1
 00d2 Letter: Upper case letter, Latin, Other, 00f2
 00d3 Letter: Upper case letter, Latin, Other, 00f3
 00d4 Letter: Upper case letter, Latin, Other, 00f4
 00d5 Letter: Upper case letter, Latin, Other, 00f5
 00d6 Letter: Upper case letter, Latin, Other, 00f6
 00d7 Symbol: Mathematical symbol, Common, Other
 00d8 Letter: Upper case letter, Latin, Other, 00f8
 00d9 Letter: Upper case letter, Latin, Other, 00f9
 00da Letter: Upper case letter, Latin, Other, 00fa
 00db Letter: Upper case letter, Latin, Other, 00fb
 00dc Letter: Upper case letter, Latin, Other, 00fc
 00dd Letter: Upper case letter, Latin, Other, 00fd
 00de Letter: Upper case letter, Latin, Other, 00fe
 00df Letter: Lower case letter, Latin, Other, 1e9e
 findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef 
 00e0 Letter: Lower case letter, Latin, Other, 00c0
 00e1 Letter: Lower case letter, Latin, Other, 00c1
 00e2 Letter: Lower case letter, Latin, Other, 00c2
 00e3 Letter: Lower case letter, Latin, Other, 00c3
 00e4 Letter: Lower case letter, Latin, Other, 00c4
 00e5 Letter: Lower case letter, Latin, Other, 00c5, 212b
 00e6 Letter: Lower case letter, Latin, Other, 00c6
 00e7 Letter: Lower case letter, Latin, Other, 00c7
 00e8 Letter: Lower case letter, Latin, Other, 00c8
 00e9 Letter: Lower case letter, Latin, Other, 00c9
 00ea Letter: Lower case letter, Latin, Other, 00ca
 00eb Letter: Lower case letter, Latin, Other, 00cb
 00ec Letter: Lower case letter, Latin, Other, 00cc
 00ed Letter: Lower case letter, Latin, Other, 00cd
 00ee Letter: Lower case letter, Latin, Other, 00ce
 00ef Letter: Lower case letter, Latin, Other, 00cf
 findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff 
 00f0 Letter: Lower case letter, Latin, Other, 00d0
 00f1 Letter: Lower case letter, Latin, Other, 00d1
 00f2 Letter: Lower case letter, Latin, Other, 00d2
 00f3 Letter: Lower case letter, Latin, Other, 00d3
 00f4 Letter: Lower case letter, Latin, Other, 00d4
 00f5 Letter: Lower case letter, Latin, Other, 00d5
 00f6 Letter: Lower case letter, Latin, Other, 00d6
 00f7 Symbol: Mathematical symbol, Common, Other
 00f8 Letter: Lower case letter, Latin, Other, 00d8
 00f9 Letter: Lower case letter, Latin, Other, 00d9
 00fa Letter: Lower case letter, Latin, Other, 00da
 00fb Letter: Lower case letter, Latin, Other, 00db
 00fc Letter: Lower case letter, Latin, Other, 00dc
 00fd Letter: Lower case letter, Latin, Other, 00dd
 00fe Letter: Lower case letter, Latin, Other, 00de
 00ff Letter: Lower case letter, Latin, Other, 0178
 findprop 0100 0101 0102 0103 0104 0105 0106
 0100 Letter: Upper case letter, Latin, Other, 0101
 0101 Letter: Lower case letter, Latin, Other, 0100
 0102 Letter: Upper case letter, Latin, Other, 0103
 0103 Letter: Lower case letter, Latin, Other, 0102
 0104 Letter: Upper case letter, Latin, Other, 0105
 0105 Letter: Lower case letter, Latin, Other, 0104
 0106 Letter: Upper case letter, Latin, Other, 0107
 findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7 
 ffe0 Symbol: Currency symbol, Common, Other
 ffe1 Symbol: Currency symbol, Common, Other
 ffe2 Symbol: Mathematical symbol, Common, Other
 ffe3 Symbol: Modifier symbol, Common, Other
 ffe4 Symbol: Other symbol, Common, Other
 ffe5 Symbol: Currency symbol, Common, Other
 ffe6 Symbol: Currency symbol, Common, Other
 ffe7 Control: Unassigned, Common, Other
 findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
 ffe8 Symbol: Other symbol, Common, Other
 ffe9 Symbol: Mathematical symbol, Common, Other
 ffea Symbol: Mathematical symbol, Common, Other
 ffeb Symbol: Mathematical symbol, Common, Other
 ffec Symbol: Mathematical symbol, Common, Other
 ffed Symbol: Other symbol, Common, Other
 ffee Symbol: Other symbol, Common, Other
 ffef Control: Unassigned, Common, Other
 findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
 fff8 Control: Unassigned, Common, Control
 fff9 Control: Format, Common, Control
 fffa Control: Format, Common, Control
 fffb Control: Format, Common, Control
 fffc Symbol: Other symbol, Common, Other
 fffd Symbol: Other symbol, Common, Other
 fffe Control: Unassigned, Common, Other
 ffff Control: Unassigned, Common, Other
 findprop 10000 10001 e01ef f0000 100000
 10000 Letter: Other letter, Linear_B, Other
 10001 Letter: Other letter, Linear_B, Other
 e01ef Mark: Non-spacing mark, Inherited, Extend
 f0000 Control: Private use, Common, Other
 100000 Control: Private use, Common, Other
 findprop 1b00 12000 7c0 a840 10900
 1b00 Mark: Non-spacing mark, Balinese, Extend
 12000 Letter: Other letter, Cuneiform, Other
 07c0 Number: Decimal number, Nko, Other
 a840 Letter: Other letter, Phags_Pa, Other
 10900 Letter: Other letter, Phoenician, Other
 findprop 1d79 a77d
 1d79 Letter: Lower case letter, Latin, Other, a77d
 a77d Letter: Upper case letter, Latin, Other, 1d79
 findprop  0800  083e  a4d0  a4f7  aa80  aadf
 0800 Letter: Other letter, Samaritan, Other
 083e Punctuation: Other punctuation, Samaritan, Other
 a4d0 Letter: Other letter, Lisu, Other
 a4f7 Letter: Other letter, Lisu, Other
 aa80 Letter: Other letter, Tai_Viet, Other
 aadf Punctuation: Other punctuation, Tai_Viet, Other
 findprop 10b00 10b35 13000 1342e 10840 10855
 10b00 Letter: Other letter, Avestan, Other
 10b35 Letter: Other letter, Avestan, Other
 13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
 1342e Letter: Other letter, Egyptian_Hieroglyphs, Other
 10840 Letter: Other letter, Imperial_Aramaic, Other
 10855 Letter: Other letter, Imperial_Aramaic, Other
 findprop 11100 1113c 11680 116c0
 11100 Mark: Non-spacing mark, Chakma, Extend
 1113c Number: Decimal number, Chakma, Other
 11680 Letter: Other letter, Takri, Other
 116c0 Number: Decimal number, Takri, Other
 findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
 000d Control: Control, Common, CR
 000a Control: Control, Common, LF
 000e Control: Control, Common, Control
 0711 Mark: Non-spacing mark, Syriac, Extend
 1b04 Mark: Spacing mark, Balinese, SpacingMark
 1111 Letter: Other letter, Hangul, Hangul syllable type L
 1169 Letter: Other letter, Hangul, Hangul syllable type V
 11fe Letter: Other letter, Hangul, Hangul syllable type T
 ae4c Letter: Other letter, Hangul, Hangul syllable type LV
 ad89 Letter: Other letter, Hangul, Hangul syllable type LVT
--- a/maint/utf8.c
+++ b/maint/utf8.c
@ -0,0 +1,253 @@
 /* A test program for converting characters to UTF-8 and vice versa. Note that
 this program conforms to the original definition of UTF-8, which allows
 codepoints up to 7fffffff. The more recent definition limits the validity of
 UTF-8 codepoints to a maximum of 10ffff.
 The arguments are either single codepoint values, written as 0xhhhh, for 
 conversion to UTF-8, or sequences of hex values, written without 0x and 
 optionally including spaces (but such arguments must be quoted), for conversion 
 from UTF-8 to codepoints. For example:
 ./utf8 0x1234
 0x00001234 => e1 88 b4
 ./utf8 "e1 88 b4"
 0x00001234 <= e1 88 b4
 In the second case, a number of characters can be present in one argument:
 ./utf8 "65 e188b4 77"
 0x00000065 <= 65 
 0x00001234 <= e1 88 b4 
 0x00000077 <= 77 
 If the option -s is given, the sequence of UTF-8 bytes is written out between 
 angle brackets at the end of the line. On a UTF-8 terminal, this will show the
 appropriate graphic for the codepoint. */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 /* The valid ranges for UTF-8 characters are:
 0000 0000  to  0000 007f   1 byte (ascii)
 0000 0080  to  0000 07ff   2 bytes
 0000 0800  to  0000 ffff   3 bytes
 0001 0000  to  001f ffff   4 bytes
 0020 0000  to  03ff ffff   5 bytes
 0400 0000  to  7fff ffff   6 bytes
 */
 /* Largest codepoint that can be encoded in (index + 1) bytes. Both ord2utf8()
 and utf82ord() scan this table to find the shortest encoding length for a
 value. */
 static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};  
 /* Marker bits that ord2utf8() ORs into the first byte of an encoding, indexed
 by the number of additional bytes that follow it. */
 static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};  
 /* Mask that utf82ord() applies to the first byte to extract its data bits,
 indexed by the number of additional bytes that follow it. */
 static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};  
 /* NOTE(review): this table is not referenced by the functions visible in this
 file. It has 64 entries, so it is presumably indexed by the low 6 bits of a
 lead byte to give a length - TODO confirm before relying on it. */
 static const unsigned char utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
 /*************************************************
 *       Convert character value to UTF-8         *
 *************************************************/
 /* Encode one character value as UTF-8, using the original definition that
 allows values from 0 to 0x7fffffff and encodings of 1 to 6 bytes.
 Arguments:
  cvalue     the character value to encode
  buffer     where to put the encoded bytes; must have room for 6 bytes
 Returns:     the number of bytes written to the buffer
              -1 if cvalue is negative
              0 if cvalue is larger than the biggest table entry (this can
              happen only when int is wider than 32 bits)
 */
 int
 ord2utf8(int cvalue, unsigned char *buffer)
 {
 const int nranges = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
 int extra = 0;   /* number of continuation bytes needed */
 int k;
 /* Find the first range whose upper bound is not below the value. A negative
 value always stops at the first range and is rejected just below. */
 while (extra < nranges && cvalue > utf8_table1[extra]) extra++;
 if (extra == nranges) return 0;
 if (cvalue < 0) return -1;
 /* Fill the continuation bytes from the last one backwards, taking six bits
 of the value each time. */
 for (k = extra; k >= 1; k--)
  {
  buffer[k] = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }
 /* What is left of the value goes into the lead byte with its marker bits. */
 buffer[0] = utf8_table2[extra] | cvalue;
 return extra + 1;
 }
 /*************************************************
 *            Convert UTF-8 string to value       *
 *************************************************/
 /* Decode the single UTF-8 character that starts at the given byte.
 Arguments:
  buffer   pointer to the first byte of the character
  vptr     where to store the decoded value; written only on success
 Returns:   > 0 => the number of bytes consumed
            0   => the first byte cannot start a character (it is a
                   continuation byte, or it is 0xfe or 0xff)
            -k  => byte k after the first is not a continuation byte, or, when
                   k is the full length of the sequence, the value was not
                   encoded in its shortest form
 */
 int
 utf82ord(unsigned char *buffer, int *vptr)
 {
 const int nranges = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
 int lead = buffer[0];
 int extra = 0;      /* number of continuation bytes the lead byte calls for */
 int mask = 0x40;
 int value, shift, k, need;
 /* A byte with the top bit clear is a complete one-byte character. */
 if ((lead & 0x80) == 0)
  {
  *vptr = lead;
  return 1;
  }
 /* Count the 1-bits that follow the top bit, stopping at six. */
 while (extra < 6 && (lead & mask) != 0)
  {
  extra++;
  mask >>= 1;
  }
 if (extra == 0 || extra == 6) return 0;
 /* Gather the data bits: first those in the lead byte, then six from each
 continuation byte. */
 shift = 6*extra;
 value = (lead & utf8_table3[extra]) << shift;
 for (k = 1; k <= extra; k++)
  {
  int cont = buffer[k];
  if ((cont & 0xc0) != 0x80) return -k;
  shift -= 6;
  value |= (cont & 0x3f) << shift;
  }
 /* The value must need exactly this many bytes; a longer-than-necessary
 encoding is rejected. */
 for (need = 0; need < nranges; need++)
  if (value <= utf8_table1[need]) break;
 if (need != extra) return -(extra+1);
 *vptr = value;
 return extra + 1;
 }
 /*************************************************
 *                 Main Program                   *
 *************************************************/
 /* Command-line driver. An optional leading -s makes each result line end with
 the raw UTF-8 bytes written between > and <. Every remaining argument is
 either "0x" followed by a hex codepoint, which is encoded with ord2utf8(), or
 a string of hex digit pairs (spaces allowed), which is converted to bytes and
 decoded one character at a time with utf82ord(). Always returns 0. */
 int
 main(int argc, char **argv)
 {
 int i = 1;
 int show = 0;
 unsigned char buffer[64];
 if (argc > 1 && strcmp(argv[1], "-s") == 0)
  {
  show = 1;
  i = 2;
  }
 for (; i < argc; i++)
  {
  unsigned char *x = (unsigned char *)argv[i];
  if (strncmp(argv[i], "0x", 2) == 0)
    {
    int j;
    int d = strtol(argv[i] + 2, NULL, 16);
    int rc = ord2utf8(d, buffer);
    printf("0x%08x => ", d);
    if (rc <= 0) printf("*** Error %d ***", rc); else
      {
      for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
      if (show)
        {
        printf(">");
        for (j = 0; j < rc; j++) printf("%c", buffer[j]);
        printf("<");
        }
      }
    printf("\n");
    }
  else
    {
    int d, rc;
    int j = 0;      /* number of bytes stored in buffer */
    int y = 0;      /* value of the byte being assembled */
    int z = 0;      /* non-zero when the first digit of a pair has been read */
    int bad = 0;    /* set when the argument cannot be converted */
    unsigned char *bptr;
    for (;;)
      {
      while (*x == ' ') x++;
      if (*x == 0 && !z) break;
      /* This also catches an argument that ends after an odd number of
      digits, because the terminating zero is not a hex digit. */
      if (!isxdigit(*x))
        {
        printf("Malformed hex string: %s\n", argv[i]);
        bad = 1;
        break;
        }
      /* 'W' is 'a' - 10 in ASCII, so a lower-cased letter maps to 10-15. */
      y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
      x++;
      if (z)
        {
        /* Keep one byte free for the terminating zero. */
        if (j >= (int)sizeof(buffer) - 1)
          {
          printf("Hex string too long (maximum %d bytes): %s\n",
            (int)sizeof(buffer) - 1, argv[i]);
          bad = 1;
          break;
          }
        buffer[j++] = y;
        y = 0;
        }
      z ^= 1;
      }
    /* Nothing useful is in the buffer after an error; go on to the next
    argument instead of decoding stale data. */
    if (bad) continue;
    buffer[j] = 0;
    bptr = buffer;
    /* The byte string is treated as zero-terminated, so a 00 byte in the
    input ends the decoding. */
    while (*bptr != 0)
      {
      rc = utf82ord(bptr, &d);
      if (rc > 0)
        {
        printf("0x%08x <= ", d);
        for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
        if (show)
          {
          printf(">");
          for (j = 0; j < rc; j++) printf("%c", bptr[j]);
          printf("<");
          }
        printf("\n");
        bptr += rc;
        }
      else
        {
        printf("Malformed UTF-8 at offset %d <= ", -rc);
        while (*bptr != 0) printf("%02x ", *bptr++);
        printf("\n");
        break;
        }
      }
    }
  }
 return 0;
 }
 /* End */
--- a/src/pcre2.h
+++ b/src/pcre2.h
@ -451,7 +451,7 @@ PCRE2_EXP_DECL int       pcre2_substring_length_byname(pcre2_match_data *, \
 PCRE2_EXP_DECL int       pcre2_substring_length_bynumber(pcre2_match_data *, \
                           int); \
 PCRE2_EXP_DECL int       pcre2_substring_nametable_scan(const pcre2_code *, \
-                           PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_UCHAR **); \
+                           PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
 PCRE2_EXP_DECL int       pcre2_substring_number_from_name(\
                           const pcre2_code *, PCRE2_SPTR); \
 PCRE2_EXP_DECL void      pcre2_substring_list_free(PCRE2_SPTR *); \
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@ -451,7 +451,7 @@ PCRE2_EXP_DECL int       pcre2_substring_length_byname(pcre2_match_data *, \
 PCRE2_EXP_DECL int       pcre2_substring_length_bynumber(pcre2_match_data *, \
                           int); \
 PCRE2_EXP_DECL int       pcre2_substring_nametable_scan(const pcre2_code *, \
-                           PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_UCHAR **); \
+                           PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
 PCRE2_EXP_DECL int       pcre2_substring_number_from_name(\
                           const pcre2_code *, PCRE2_SPTR); \
 PCRE2_EXP_DECL void      pcre2_substring_list_free(PCRE2_SPTR *); \
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -102,23 +102,57 @@ if (ccontext == NULL)
 if (pattern[0] == 'Y')
  {
-  c = ccontext->memctl.malloc(sizeof(pcre2_real_code), NULL);
+  PCRE2_UCHAR *n; 
  int lennumber = (PCRE2_CODE_UNIT_WIDTH == 8)? 2 : 1;
  size_t size = sizeof(pcre2_real_code) + 
    (12 + 3*lennumber)*(PCRE2_CODE_UNIT_WIDTH/8) + CU2BYTES(20);
  c = ccontext->memctl.malloc(size, NULL);
  c->memctl = ccontext->memctl; 
  c->magic_number = MAGIC_NUMBER;
-  c->size = sizeof(pcre2_real_code);  
+  c->size = size;
  c->name_table_offset = sizeof(pcre2_real_code); 
  c->compile_options = options; 
  c->flags = PCRE2_CODE_UNIT_WIDTH/8;
  c->limit_match = 0;
  c->limit_recursion = 0;
  c->max_lookbehind = 0;
  c->minlength = 3;
-  c->top_bracket = 1;
+  c->top_bracket = 5;
  c->top_backref = 1;       
  c->bsr_convention = ccontext->bsr_convention;
  c->newline_convention = ccontext->newline_convention;  
-  c->name_count = 0; 
+  c->name_count = 3; 
-  c->name_entry_size = 0; 
+  c->name_entry_size = 4 + lennumber; 
  n = (PCRE2_UCHAR *)((char *)c + sizeof(pcre2_real_code));
  if (lennumber == 2) *n++ = 0 ;
  *n++ = 1;
  *n++ = 'x'; *n++ = 'x'; *n++ = 'x'; *n++ = 0;
  if (lennumber == 2) *n++ = 0 ;
  *n++ = 2;
  *n++ = 'y'; *n++ = 'y'; *n++ = 'y'; *n++ = 0;
  if (lennumber == 2) *n++ = 0 ;
  *n++ = 3;
  *n++ = 'y'; *n++ = 'y'; *n++ = 'y'; *n++ = 0;
  *n++ = OP_CHAR;
  *n++ = 'x';
  *n++ = OP_CHARI;
  *n++ = 'Y';   
  *n++ = OP_PROP;
  *n++ = PT_SC;
  *n++ = 0;  
  *n++ = OP_DNRREF;
  *n++ = 0;  
  *n++ = OP_END;
  } 
 else
--- a/src/pcre2_context.c
+++ b/src/pcre2_context.c
@ -78,27 +78,26 @@ memory control data is to be stored for future use.
 Arguments:
  size        amount of memory required
  offset      offset in memory block to memctl structure
-  gcontext    a general context or NULL
+  memctl      pointer to a memctl block or NULL
 Returns:      pointer to memory or NULL on failure
 */   
 PCRE2_EXP_DEFN void *
-PRIV(memctl_malloc)(size_t size, size_t offset, 
+PRIV(memctl_malloc)(size_t size, size_t offset, pcre2_memctl *memctl)
  pcre2_general_context *gcontext)
 {
-pcre2_memctl *memctl;
+pcre2_memctl *newmemctl;
-void *yield = (gcontext == NULL)? malloc(size) :
+void *yield = (memctl == NULL)? malloc(size) :
-  gcontext->memctl.malloc(size, gcontext->memctl.memory_data);
+  memctl->malloc(size, memctl->memory_data);
 if (yield == NULL) return NULL; 
-memctl = (pcre2_memctl *)(((uint8_t *)yield) + offset);
+newmemctl = (pcre2_memctl *)(((uint8_t *)yield) + offset);
-if (gcontext == NULL)
+if (memctl == NULL)
  {
-  memctl->malloc = default_malloc;
+  newmemctl->malloc = default_malloc;
-  memctl->free = default_free;
+  newmemctl->free = default_free;
-  memctl->memory_data = NULL;
+  newmemctl->memory_data = NULL;
  }
-else *memctl = gcontext->memctl;       
+else *newmemctl = *memctl;       
 return yield;
 }   
@ -152,7 +151,7 @@ pcre2_compile_context_create(pcre2_general_context *gcontext)
 pcre2_compile_context *ccontext = PRIV(memctl_malloc)(
  sizeof(pcre2_real_compile_context), 
  offsetof(pcre2_real_compile_context, memctl),
-  gcontext); 
+  &(gcontext->memctl)); 
 if (ccontext == NULL) return NULL;  
 PRIV(compile_context_init)(ccontext, FALSE);
 return ccontext;
@ -184,7 +183,7 @@ pcre2_match_context_create(pcre2_general_context *gcontext)
 pcre2_match_context *mcontext = PRIV(memctl_malloc)(
  sizeof(pcre2_real_match_context),
  offsetof(pcre2_real_compile_context, memctl),
-  gcontext);  
+  &(gcontext->memctl));  
 if (mcontext == NULL) return NULL;   
 PRIV(match_context_init)(mcontext, FALSE);
 return mcontext;
@ -240,21 +239,24 @@ return new;
 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 pcre2_general_context_free(pcre2_general_context *gcontext)
 {
-gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
+if (gcontext != NULL)
  gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
 }
 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 pcre2_compile_context_free(pcre2_compile_context *ccontext)
 {
-ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
+if (ccontext != NULL)
  ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
 }
 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 pcre2_match_context_free(pcre2_match_context *mcontext)
 {
-mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
+if (mcontext != NULL)
  mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
 }
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <string.h>
 #include "pcre2.h"
 #include "pcre2_ucp.h"
 #define PUBL(name) pcre2_##name
@ -77,6 +78,11 @@ typedef int BOOL;
 #include <valgrind/memcheck.h>
 #endif
 /* This is an unsigned int value that no character can ever have, as
 Unicode doesn't go beyond 0x0010ffff. */
 #define NOTACHAR 0xffffffff
 /* When UTF encoding is being used, a character is no longer just a single
 byte in 8-bit mode or a single short in 16-bit mode. The macros for character
 handling generate simple sequences when used in the basic mode, and more
@ -165,6 +171,109 @@ the pointer. */
 #endif  /* SUPPORT_UTF */
 /* Tests for Unicode horizontal and vertical whitespace characters must check a
 number of different values. Using a switch statement for this generates the
 fastest code (no loop, no memory access), and there are several places in the
 interpreter code where this happens. In order to ensure that all the case lists
 remain in step, we use macros so that there is only one place where the lists
 are defined.
 These values are also required as lists in pcre2_compile.c when processing \h,
 \H, \v and \V in a character class. The lists are defined in pcre2_tables.c,
 but macros that define the values are here so that all the definitions are
 together. The lists must be in ascending character order, terminated by
 NOTACHAR (which is 0xffffffff).
 Any changes should ensure that the various macros are kept in step with each
 other. NOTE: The values also appear in pcre2_jit_compile.c. */
 /* ------ ASCII/Unicode environments ------ */
 #ifndef EBCDIC
 #define HSPACE_LIST \
  CHAR_HT, CHAR_SPACE, 0xa0, \
  0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
  0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
  NOTACHAR
 #define HSPACE_MULTIBYTE_CASES \
  case 0x1680:  /* OGHAM SPACE MARK */ \
  case 0x180e:  /* MONGOLIAN VOWEL SEPARATOR */ \
  case 0x2000:  /* EN QUAD */ \
  case 0x2001:  /* EM QUAD */ \
  case 0x2002:  /* EN SPACE */ \
  case 0x2003:  /* EM SPACE */ \
  case 0x2004:  /* THREE-PER-EM SPACE */ \
  case 0x2005:  /* FOUR-PER-EM SPACE */ \
  case 0x2006:  /* SIX-PER-EM SPACE */ \
  case 0x2007:  /* FIGURE SPACE */ \
  case 0x2008:  /* PUNCTUATION SPACE */ \
  case 0x2009:  /* THIN SPACE */ \
  case 0x200A:  /* HAIR SPACE */ \
  case 0x202f:  /* NARROW NO-BREAK SPACE */ \
  case 0x205f:  /* MEDIUM MATHEMATICAL SPACE */ \
  case 0x3000   /* IDEOGRAPHIC SPACE */
 #define HSPACE_BYTE_CASES \
  case CHAR_HT: \
  case CHAR_SPACE: \
  case 0xa0     /* NBSP */
 #define HSPACE_CASES \
  HSPACE_BYTE_CASES: \
  HSPACE_MULTIBYTE_CASES
 #define VSPACE_LIST \
  CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR
 #define VSPACE_MULTIBYTE_CASES \
  case 0x2028:    /* LINE SEPARATOR */ \
  case 0x2029     /* PARAGRAPH SEPARATOR */
 #define VSPACE_BYTE_CASES \
  case CHAR_LF: \
  case CHAR_VT: \
  case CHAR_FF: \
  case CHAR_CR: \
  case CHAR_NEL
 #define VSPACE_CASES \
  VSPACE_BYTE_CASES: \
  VSPACE_MULTIBYTE_CASES
 /* ------ EBCDIC environments ------ */
 #else
 #define HSPACE_LIST CHAR_HT, CHAR_SPACE
 #define HSPACE_BYTE_CASES \
  case CHAR_HT: \
  case CHAR_SPACE
 #define HSPACE_CASES HSPACE_BYTE_CASES
 #ifdef EBCDIC_NL25
 #define VSPACE_LIST \
  CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR
 #else
 #define VSPACE_LIST \
  CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR
 #endif
 #define VSPACE_BYTE_CASES \
  case CHAR_LF: \
  case CHAR_VT: \
  case CHAR_FF: \
  case CHAR_CR: \
  case CHAR_NEL
 #define VSPACE_CASES VSPACE_BYTE_CASES
 #endif  /* EBCDIC */
 /* ------ End of whitespace macros ------ */
 /* Private flags containing information about the compiled pattern. The first
 three must not be changed, because whichever is set is actually the number of
 bytes in a code unit in that mode. */
@ -801,7 +910,519 @@ only. */
 /* -------------------- End of character and string names -------------------*/
-/* Private structures that are mode-independent. */
+/* -------------------- Definitions for compiled patterns -------------------*/
 /* Escape items that are just an encoding of a particular data value. */
 #ifndef ESC_e
 #define ESC_e CHAR_ESC
 #endif
 #ifndef ESC_f
 #define ESC_f CHAR_FF
 #endif
 #ifndef ESC_n
 #define ESC_n CHAR_LF
 #endif
 #ifndef ESC_r
 #define ESC_r CHAR_CR
 #endif
 /* We can't officially use ESC_t because it is a POSIX reserved identifier
 (presumably because of all the others like size_t). */
 #ifndef ESC_tee
 #define ESC_tee CHAR_HT
 #endif
 /* Codes for different types of Unicode property */
 #define PT_ANY        0    /* Any property - matches all chars */
 #define PT_LAMP       1    /* L& - the union of Lu, Ll, Lt */
 #define PT_GC         2    /* Specified general characteristic (e.g. L) */
 #define PT_PC         3    /* Specified particular characteristic (e.g. Lu) */
 #define PT_SC         4    /* Script (e.g. Han) */
 #define PT_ALNUM      5    /* Alphanumeric - the union of L and N */
 #define PT_SPACE      6    /* Perl space - Z plus 9,10,12,13 */
 #define PT_PXSPACE    7    /* POSIX space - Z plus 9,10,11,12,13 */
 #define PT_WORD       8    /* Word - L plus N plus underscore */
 #define PT_CLIST      9    /* Pseudo-property: match character list */
 #define PT_UCNC      10    /* Universal Character nameable character */
 #define PT_TABSIZE   11    /* Size of square table for autopossessify tests */
 /* The following special properties are used only in XCLASS items, when POSIX
 classes are specified and PCRE_UCP is set - in other words, for Unicode
 handling of these classes. They are not available via the \p or \P escapes like
 those in the above list, and so they do not take part in the autopossessifying
 table. */
 #define PT_PXGRAPH   11    /* [:graph:] - characters that mark the paper */
 #define PT_PXPRINT   12    /* [:print:] - [:graph:] plus non-control spaces */
 #define PT_PXPUNCT   13    /* [:punct:] - punctuation characters */
 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
 contain characters with values greater than 255. */
 #define XCL_NOT       0x01    /* Flag: this is a negative class */
 #define XCL_MAP       0x02    /* Flag: a 32-byte map is present */
 #define XCL_HASPROP   0x04    /* Flag: property checks are present. */
 #define XCL_END       0    /* Marks end of individual items */
 #define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
 #define XCL_RANGE     2    /* A range (two multibyte chars) follows */
 #define XCL_PROP      3    /* Unicode property (2-byte property code follows) */
 #define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */
 /* These are escaped items that aren't just an encoding of a particular data
 value such as \n. They must have non-zero values, as check_escape() returns 0
 for a data character.  Also, they must appear in the same order as in the
 opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
 corresponds to "." in DOTALL mode rather than an escape sequence. It is also
 used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In
 non-DOTALL mode, "." behaves like \N.
 The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
 when PCRE_UCP is set and replacement of \d etc by \p sequences is required.
 They must be contiguous, and remain in order so that the replacements can be
 looked up from a table.
 Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
 check_escape(). There are two tests in the code for an escape
 greater than ESC_b and less than ESC_Z to detect the types that may be
 repeated. These are the types that consume characters. If any new escapes are
 put in between that don't consume a character, that code will have to change.
 */
 enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
       ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
       ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
       ESC_E, ESC_Q, ESC_g, ESC_k,
       ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu };
 /********************** Opcode definitions ******************/
 /****** NOTE NOTE NOTE ******
 Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in
 order to the list of escapes immediately above. Furthermore, values up to
 OP_DOLLM must not be changed without adjusting the table called autoposstab in
 pcre_compile.c
 Whenever this list is updated, the two macro definitions that follow must be
 updated to match. The possessification table called "opcode_possessify" in
 pcre_compile.c must also be updated, and also the tables called "coptable"
 and "poptable" in pcre_dfa_exec.c.
 ****** NOTE NOTE NOTE ******/
 /* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive,
 are used in a table for deciding whether a repeated character type can be
 auto-possessified. */
 #define FIRST_AUTOTAB_OP       OP_NOT_DIGIT
 #define LAST_AUTOTAB_LEFT_OP   OP_EXTUNI
 #define LAST_AUTOTAB_RIGHT_OP  OP_DOLLM
 enum {
  OP_END,            /* 0 End of pattern */
  /* Values corresponding to backslashed metacharacters */
  OP_SOD,            /* 1 Start of data: \A */
  OP_SOM,            /* 2 Start of match (subject + offset): \G */
  OP_SET_SOM,        /* 3 Set start of match (\K) */
  OP_NOT_WORD_BOUNDARY,  /*  4 \B */
  OP_WORD_BOUNDARY,      /*  5 \b */
  OP_NOT_DIGIT,          /*  6 \D */
  OP_DIGIT,              /*  7 \d */
  OP_NOT_WHITESPACE,     /*  8 \S */
  OP_WHITESPACE,         /*  9 \s */
  OP_NOT_WORDCHAR,       /* 10 \W */
  OP_WORDCHAR,           /* 11 \w */
  OP_ANY,            /* 12 Match any character except newline (\N) */
  OP_ALLANY,         /* 13 Match any character */
  OP_ANYBYTE,        /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
  OP_NOTPROP,        /* 15 \P (not Unicode property) */
  OP_PROP,           /* 16 \p (Unicode property) */
  OP_ANYNL,          /* 17 \R (any newline sequence) */
  OP_NOT_HSPACE,     /* 18 \H (not horizontal whitespace) */
  OP_HSPACE,         /* 19 \h (horizontal whitespace) */
  OP_NOT_VSPACE,     /* 20 \V (not vertical whitespace) */
  OP_VSPACE,         /* 21 \v (vertical whitespace) */
  OP_EXTUNI,         /* 22 \X (extended Unicode sequence) */
  OP_EODN,           /* 23 End of data or \n at end of data (\Z) */
  OP_EOD,            /* 24 End of data (\z) */
  /* Line end assertions */
  OP_DOLL,           /* 25 End of line - not multiline */
  OP_DOLLM,          /* 26 End of line - multiline */
  OP_CIRC,           /* 27 Start of line - not multiline */
  OP_CIRCM,          /* 28 Start of line - multiline */
  /* Single characters; caseful must precede the caseless ones */
  OP_CHAR,           /* 29 Match one character, casefully */
  OP_CHARI,          /* 30 Match one character, caselessly */
  OP_NOT,            /* 31 Match one character, not the given one, casefully */
  OP_NOTI,           /* 32 Match one character, not the given one, caselessly */
  /* The following sets of 13 opcodes must always be kept in step because
  the offset from the first one is used to generate the others. */
  /* Repeated characters; caseful must precede the caseless ones */
  OP_STAR,           /* 33 The maximizing and minimizing versions of */
  OP_MINSTAR,        /* 34 these six opcodes must come in pairs, with */
  OP_PLUS,           /* 35 the minimizing one second. */
  OP_MINPLUS,        /* 36 */
  OP_QUERY,          /* 37 */
  OP_MINQUERY,       /* 38 */
  OP_UPTO,           /* 39 From 0 to n matches of one character, caseful*/
  OP_MINUPTO,        /* 40 */
  OP_EXACT,          /* 41 Exactly n matches */
  OP_POSSTAR,        /* 42 Possessified star, caseful */
  OP_POSPLUS,        /* 43 Possessified plus, caseful */
  OP_POSQUERY,       /* 44 Possessified query, caseful */
  OP_POSUPTO,        /* 45 Possessified upto, caseful */
  /* Repeated characters; caseless must follow the caseful ones */
  OP_STARI,          /* 46 */
  OP_MINSTARI,       /* 47 */
  OP_PLUSI,          /* 48 */
  OP_MINPLUSI,       /* 49 */
  OP_QUERYI,         /* 50 */
  OP_MINQUERYI,      /* 51 */
  OP_UPTOI,          /* 52 From 0 to n matches of one character, caseless */
  OP_MINUPTOI,       /* 53 */
  OP_EXACTI,         /* 54 */
  OP_POSSTARI,       /* 55 Possessified star, caseless */
  OP_POSPLUSI,       /* 56 Possessified plus, caseless */
  OP_POSQUERYI,      /* 57 Possessified query, caseless */
  OP_POSUPTOI,       /* 58 Possessified upto, caseless */
  /* The negated ones must follow the non-negated ones, and match them */
  /* Negated repeated character, caseful; must precede the caseless ones */
  OP_NOTSTAR,        /* 59 The maximizing and minimizing versions of */
  OP_NOTMINSTAR,     /* 60 these six opcodes must come in pairs, with */
  OP_NOTPLUS,        /* 61 the minimizing one second. They must be in */
  OP_NOTMINPLUS,     /* 62 exactly the same order as those above. */
  OP_NOTQUERY,       /* 63 */
  OP_NOTMINQUERY,    /* 64 */
  OP_NOTUPTO,        /* 65 From 0 to n matches, caseful */
  OP_NOTMINUPTO,     /* 66 */
  OP_NOTEXACT,       /* 67 Exactly n matches */
  OP_NOTPOSSTAR,     /* 68 Possessified versions, caseful */
  OP_NOTPOSPLUS,     /* 69 */
  OP_NOTPOSQUERY,    /* 70 */
  OP_NOTPOSUPTO,     /* 71 */
  /* Negated repeated character, caseless; must follow the caseful ones */
  OP_NOTSTARI,       /* 72 */
  OP_NOTMINSTARI,    /* 73 */
  OP_NOTPLUSI,       /* 74 */
  OP_NOTMINPLUSI,    /* 75 */
  OP_NOTQUERYI,      /* 76 */
  OP_NOTMINQUERYI,   /* 77 */
  OP_NOTUPTOI,       /* 78 From 0 to n matches, caseless */
  OP_NOTMINUPTOI,    /* 79 */
  OP_NOTEXACTI,      /* 80 Exactly n matches */
  OP_NOTPOSSTARI,    /* 81 Possessified versions, caseless */
  OP_NOTPOSPLUSI,    /* 82 */
  OP_NOTPOSQUERYI,   /* 83 */
  OP_NOTPOSUPTOI,    /* 84 */
  /* Character types */
  OP_TYPESTAR,       /* 85 The maximizing and minimizing versions of */
  OP_TYPEMINSTAR,    /* 86 these six opcodes must come in pairs, with */
  OP_TYPEPLUS,       /* 87 the minimizing one second. These codes must */
  OP_TYPEMINPLUS,    /* 88 be in exactly the same order as those above. */
  OP_TYPEQUERY,      /* 89 */
  OP_TYPEMINQUERY,   /* 90 */
  OP_TYPEUPTO,       /* 91 From 0 to n matches */
  OP_TYPEMINUPTO,    /* 92 */
  OP_TYPEEXACT,      /* 93 Exactly n matches */
  OP_TYPEPOSSTAR,    /* 94 Possessified versions */
  OP_TYPEPOSPLUS,    /* 95 */
  OP_TYPEPOSQUERY,   /* 96 */
  OP_TYPEPOSUPTO,    /* 97 */
  /* These are used for character classes and back references; only the
  first six are the same as the sets above. */
  OP_CRSTAR,         /* 98 The maximizing and minimizing versions of */
  OP_CRMINSTAR,      /* 99 all these opcodes must come in pairs, with */
  OP_CRPLUS,         /* 100 the minimizing one second. These codes must */
  OP_CRMINPLUS,      /* 101 be in exactly the same order as those above. */
  OP_CRQUERY,        /* 102 */
  OP_CRMINQUERY,     /* 103 */
  OP_CRRANGE,        /* 104 These are different to the three sets above. */
  OP_CRMINRANGE,     /* 105 */
  OP_CRPOSSTAR,      /* 106 Possessified versions */
  OP_CRPOSPLUS,      /* 107 */
  OP_CRPOSQUERY,     /* 108 */
  OP_CRPOSRANGE,     /* 109 */
  /* End of quantifier opcodes */
  OP_CLASS,          /* 110 Match a character class, chars < 256 only */
  OP_NCLASS,         /* 111 Same, but the bitmap was created from a negative
                              class - the difference is relevant only when a
                              character > 255 is encountered. */
  OP_XCLASS,         /* 112 Extended class for handling > 255 chars within the
                              class. This does both positive and negative. */
  OP_REF,            /* 113 Match a back reference, casefully */
  OP_REFI,           /* 114 Match a back reference, caselessly */
  OP_DNREF,          /* 115 Match a duplicate name backref, casefully */
  OP_DNREFI,         /* 116 Match a duplicate name backref, caselessly */
  OP_RECURSE,        /* 117 Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT,        /* 118 Call out to external function if provided */
  OP_ALT,            /* 119 Start of alternation */
  OP_KET,            /* 120 End of group that doesn't have an unbounded repeat */
  OP_KETRMAX,        /* 121 These two must remain together and in this */
  OP_KETRMIN,        /* 122 order. They are for groups that repeat for ever. */
  OP_KETRPOS,        /* 123 Possessive unlimited repeat. */
  /* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
  asserts must remain in order. */
  OP_REVERSE,        /* 124 Move pointer back - used in lookbehind assertions */
  OP_ASSERT,         /* 125 Positive lookahead */
  OP_ASSERT_NOT,     /* 126 Negative lookahead */
  OP_ASSERTBACK,     /* 127 Positive lookbehind */
  OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */
  /* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
  after the assertions, with ONCE first, as there's a test for >= ONCE for a
  subpattern that isn't an assertion. The POS versions must immediately follow
  the non-POS versions in each case. */
  OP_ONCE,           /* 129 Atomic group, contains captures */
  OP_ONCE_NC,        /* 130 Atomic group containing no captures */
  OP_BRA,            /* 131 Start of non-capturing bracket */
  OP_BRAPOS,         /* 132 Ditto, with unlimited, possessive repeat */
  OP_CBRA,           /* 133 Start of capturing bracket */
  OP_CBRAPOS,        /* 134 Ditto, with unlimited, possessive repeat */
  OP_COND,           /* 135 Conditional group */
  /* These five must follow the previous five, in the same order. There's a
  check for >= SBRA to distinguish the two sets. */
  OP_SBRA,           /* 136 Start of non-capturing bracket, check empty  */
  OP_SBRAPOS,        /* 137 Ditto, with unlimited, possessive repeat */
  OP_SCBRA,          /* 138 Start of capturing bracket, check empty */
  OP_SCBRAPOS,       /* 139 Ditto, with unlimited, possessive repeat */
  OP_SCOND,          /* 140 Conditional group, check empty */
  /* The next two pairs must (respectively) be kept together. */
  OP_CREF,           /* 141 Used to hold a capture number as condition */
  OP_DNCREF,         /* 142 Used to point to duplicate names as a condition */
  OP_RREF,           /* 143 Used to hold a recursion number as condition */
  OP_DNRREF,         /* 144 Used to point to duplicate names as a condition */
  OP_DEF,            /* 145 The DEFINE condition */
  OP_BRAZERO,        /* 146 These two must remain together and in this */
  OP_BRAMINZERO,     /* 147 order. */
  OP_BRAPOSZERO,     /* 148 */
  /* These are backtracking control verbs */
  OP_MARK,           /* 149 always has an argument */
  OP_PRUNE,          /* 150 */
  OP_PRUNE_ARG,      /* 151 same, but with argument */
  OP_SKIP,           /* 152 */
  OP_SKIP_ARG,       /* 153 same, but with argument */
  OP_THEN,           /* 154 */
  OP_THEN_ARG,       /* 155 same, but with argument */
  OP_COMMIT,         /* 156 */
  /* These are forced failure and success verbs */
  OP_FAIL,           /* 157 */
  OP_ACCEPT,         /* 158 */
  OP_ASSERT_ACCEPT,  /* 159 Used inside assertions */
  OP_CLOSE,          /* 160 Used before OP_ACCEPT to close open captures */
  /* This is used to skip a subpattern with a {0} quantifier */
  OP_SKIPZERO,       /* 161 */
  /* This is not an opcode, but is used to check that tables indexed by opcode
  are the correct length, in order to catch updating errors - there have been
  some in the past. */
  OP_TABLE_LENGTH
 };
 /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
 definitions that follow must also be updated to match. There are also tables
 called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
 pcre_dfa_exec.c that must be updated. */
 /* This macro defines textual names for all the opcodes. These are used only
 for debugging, and some of them are only partial names. The macro is referenced
 only in pcre_printint.c, which fills out the full names in many cases (and in
 some cases doesn't actually use these names at all). There must be exactly one
 string for each opcode, in the same order as the opcode enum, that is,
 OP_TABLE_LENGTH strings in total. */
 #define OP_NAME_LIST \
  "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d",         \
  "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte",         \
  "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v",           \
  "extuni",  "\\Z", "\\z",                                        \
  "$", "$", "^", "^", "char", "chari", "not", "noti",             \
  "*", "*?", "+", "+?", "?", "??",                                \
  "{", "{", "{",                                                  \
  "*+","++", "?+", "{",                                           \
  "*", "*?", "+", "+?", "?", "??",                                \
  "{", "{", "{",                                                  \
  "*+","++", "?+", "{",                                           \
  "*", "*?", "+", "+?", "?", "??",                                \
  "{", "{", "{",                                                  \
  "*+","++", "?+", "{",                                           \
  "*", "*?", "+", "+?", "?", "??",                                \
  "{", "{", "{",                                                  \
  "*+","++", "?+", "{",                                           \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*+","++", "?+", "{",                                           \
  "*", "*?", "+", "+?", "?", "??", "{", "{",                      \
  "*+","++", "?+", "{",                                           \
  "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi",  \
  "Recurse", "Callout",                                           \
  "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos",                  \
  "Reverse", "Assert", "Assert not", "AssertB", "AssertB not",    \
  "Once", "Once_NC",                                              \
  "Bra", "BraPos", "CBra", "CBraPos",                             \
  "Cond",                                                         \
  "SBra", "SBraPos", "SCBra", "SCBraPos",                         \
  "SCond",                                                        \
  "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \
  "Brazero", "Braminzero", "Braposzero",                          \
  "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP",                  \
  "*THEN", "*THEN", "*COMMIT", "*FAIL",                           \
  "*ACCEPT", "*ASSERT_ACCEPT",                                    \
  "Close", "Skip zero"
 /* This macro defines the length of fixed length operations in the compiled
 regex. The lengths are used when searching for specific things, and also in the
 debugging printing of a compiled regex. We use a macro so that it can be
 defined close to the definitions of the opcodes themselves.
 As things have been extended, some of these are no longer fixed lengths, but
 are minima instead. For example, the length of a single-character repeat may
 vary in UTF-8 mode. The code that uses this table must know about such things.
 There is one entry for each opcode, in the same order as the opcode enum. */
 #define OP_LENGTHS \
  1,                             /* End                                    */ \
  1, 1, 1, 1, 1,                 /* \A, \G, \K, \B, \b                     */ \
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */ \
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */ \
  3, 3,                          /* \P, \p                                 */ \
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */ \
  1,                             /* \X                                     */ \
  1, 1, 1, 1, 1, 1,              /* \Z, \z, $, $M, ^, ^M                   */ \
  2,                             /* Char  - the minimum length             */ \
  2,                             /* Chari  - the minimum length            */ \
  2,                             /* not                                    */ \
  2,                             /* noti                                   */ \
  /* Positive single-char repeats                             ** These are */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??       ** minima in */ \
  2+IMM2_SIZE, 2+IMM2_SIZE,      /* upto, minupto             ** UTF-8     */ \
  2+IMM2_SIZE,                   /* exact                     ** mode      */ \
  2, 2, 2, 2+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */ \
  2, 2, 2, 2, 2, 2,              /* *I, *?I, +I, +?I, ?I, ??I              */ \
  2+IMM2_SIZE, 2+IMM2_SIZE,      /* upto I, minupto I                      */ \
  2+IMM2_SIZE,                   /* exact I                                */ \
  2, 2, 2, 2+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */ \
  /* Negative single-char repeats - only for chars < 256                   */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
  2+IMM2_SIZE, 2+IMM2_SIZE,      /* NOT upto, minupto                      */ \
  2+IMM2_SIZE,                   /* NOT exact                              */ \
  2, 2, 2, 2+IMM2_SIZE,          /* Possessive NOT *, +, ?, upto           */ \
  2, 2, 2, 2, 2, 2,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */ \
  2+IMM2_SIZE, 2+IMM2_SIZE,      /* NOT upto I, minupto I                  */ \
  2+IMM2_SIZE,                   /* NOT exact I                            */ \
  2, 2, 2, 2+IMM2_SIZE,          /* Possessive NOT *I, +I, ?I, upto I      */ \
  /* Positive type repeats                                                 */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
  2+IMM2_SIZE, 2+IMM2_SIZE,      /* Type upto, minupto                     */ \
  2+IMM2_SIZE,                   /* Type exact                             */ \
  2, 2, 2, 2+IMM2_SIZE,          /* Possessive *+, ++, ?+, upto+           */ \
  /* Character class & ref repeats                                         */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
  1+2*IMM2_SIZE, 1+2*IMM2_SIZE,  /* CRRANGE, CRMINRANGE                    */ \
  1, 1, 1, 1+2*IMM2_SIZE,        /* Possessive *+, ++, ?+, CRPOSRANGE      */ \
  1+(32/sizeof(PCRE2_UCHAR)),    /* CLASS                                  */ \
  1+(32/sizeof(PCRE2_UCHAR)),    /* NCLASS                                 */ \
  0,                             /* XCLASS - variable length               */ \
  1+IMM2_SIZE,                   /* REF                                    */ \
  1+IMM2_SIZE,                   /* REFI                                   */ \
  1+2*IMM2_SIZE,                 /* DNREF                                  */ \
  1+2*IMM2_SIZE,                 /* DNREFI                                 */ \
  1+LINK_SIZE,                   /* RECURSE                                */ \
  2+2*LINK_SIZE,                 /* CALLOUT                                */ \
  1+LINK_SIZE,                   /* Alt                                    */ \
  1+LINK_SIZE,                   /* Ket                                    */ \
  1+LINK_SIZE,                   /* KetRmax                                */ \
  1+LINK_SIZE,                   /* KetRmin                                */ \
  1+LINK_SIZE,                   /* KetRpos                                */ \
  1+LINK_SIZE,                   /* Reverse                                */ \
  1+LINK_SIZE,                   /* Assert                                 */ \
  1+LINK_SIZE,                   /* Assert not                             */ \
  1+LINK_SIZE,                   /* Assert behind                          */ \
  1+LINK_SIZE,                   /* Assert behind not                      */ \
  1+LINK_SIZE,                   /* ONCE                                   */ \
  1+LINK_SIZE,                   /* ONCE_NC                                */ \
  1+LINK_SIZE,                   /* BRA                                    */ \
  1+LINK_SIZE,                   /* BRAPOS                                 */ \
  1+LINK_SIZE+IMM2_SIZE,         /* CBRA                                   */ \
  1+LINK_SIZE+IMM2_SIZE,         /* CBRAPOS                                */ \
  1+LINK_SIZE,                   /* COND                                   */ \
  1+LINK_SIZE,                   /* SBRA                                   */ \
  1+LINK_SIZE,                   /* SBRAPOS                                */ \
  1+LINK_SIZE+IMM2_SIZE,         /* SCBRA                                  */ \
  1+LINK_SIZE+IMM2_SIZE,         /* SCBRAPOS                               */ \
  1+LINK_SIZE,                   /* SCOND                                  */ \
  1+IMM2_SIZE, 1+2*IMM2_SIZE,    /* CREF, DNCREF                           */ \
  1+IMM2_SIZE, 1+2*IMM2_SIZE,    /* RREF, DNRREF                           */ \
  1,                             /* DEF                                    */ \
  1, 1, 1,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */ \
  3, 1, 3,                       /* MARK, PRUNE, PRUNE_ARG                 */ \
  1, 3,                          /* SKIP, SKIP_ARG                         */ \
  1, 3,                          /* THEN, THEN_ARG                         */ \
  1, 1, 1, 1,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */ \
  1+IMM2_SIZE, 1                 /* CLOSE, SKIPZERO                        */
 /* A magic value for OP_RREF to indicate the "any recursion" condition. */
 #define RREF_ANY  0xffff
 /* ---------- Private structures that are mode-independent. ---------- */
 /* Structure to hold data for custom memory management. */
@ -811,15 +1432,64 @@ typedef struct pcre2_memctl {
  void      *memory_data;
 } pcre2_memctl;
-/* The other private structures used by PCRE are defined in a separate file.
+/* Layout of the UCP type table that translates property names into types and
 codes. Each entry used to point directly to a name, but to reduce the number of
 relocations in shared libraries, it now has an offset into a single string
 instead. */
 typedef struct {
  uint16_t name_offset;
  uint16_t type;
  uint16_t value;
 } ucp_type_table;
 /* Unicode character database (UCD) */
 typedef struct {
  uint8_t script;     /* ucp_Arabic, etc. */
  uint8_t chartype;   /* ucp_Cc, etc. (general categories) */
  uint8_t gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
  uint8_t caseset;    /* offset to multichar other cases or zero */
  int32_t other_case; /* offset to other case, or zero if none */
 } ucd_record;
 extern const uint32_t    PRIV(ucd_caseless_sets)[];
 extern const ucd_record  PRIV(ucd_records)[];
 extern const uint8_t     PRIV(ucd_stage1)[];
 extern const uint16_t    PRIV(ucd_stage2)[];
 extern const uint32_t    PRIV(ucp_gentype)[];
 extern const uint32_t    PRIV(ucp_gbtable)[];
 #ifdef SUPPORT_JIT
 extern const int         PRIV(ucp_typerange)[];
 #endif
 /* UCD access macros. GET_UCD(ch) yields a pointer to the ucd_record for the
 character ch using a two-stage lookup: ucd_stage1 is indexed by the block
 number (ch / UCD_BLOCK_SIZE) and selects a block in ucd_stage2; the entry at
 the character's offset within that block (ch % UCD_BLOCK_SIZE) is the index
 into ucd_records. The other macros pick out single fields of that record.
 UCD_OTHERCASE adds the record's other_case offset to the character itself.
 The argument is expanded more than once, so it must have no side effects; it
 is always parenthesized so that an expression argument is cast as a whole. */
 #define UCD_BLOCK_SIZE 128
 #define GET_UCD(ch) (PRIV(ucd_records) + \
        PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
        UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
 #define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
 #define UCD_SCRIPT(ch)      GET_UCD(ch)->script
 #define UCD_CATEGORY(ch)    PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
 #define UCD_GRAPHBREAK(ch)  GET_UCD(ch)->gbprop
 #define UCD_CASESET(ch)     GET_UCD(ch)->caseset
 #define UCD_OTHERCASE(ch)   ((uint32_t)((int)(ch) + (int)(GET_UCD(ch)->other_case)))
 /* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */
 #ifdef PCRE2_CODE_UNIT_WIDTH
 /* Mode-dependent macros and private structures are defined in a separate file.
 When compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we
 include them at the appropriate width. When compiling pcre2test, however, that
 macro is not set at this point because pcre2test needs to include them at all
 supported widths. */
-#ifdef PCRE2_CODE_UNIT_WIDTH
+#include "pcre2_intmodedep.h"
 #include "pcre2_intstructs.h"
 #endif
 /* Internal shared functions. These are functions that are used by more than
 one of the library's exported public functions. They have to be "external" in
@ -827,14 +1497,15 @@ the C sense, but are not part of the PCRE public API. They are not referenced
 from pcre2test, and must not be defined when no code unit width is available.
 */
 #ifdef PCRE2_CODE_UNIT_WIDTH
 #define _pcre2_compile_context_init  PCRE2_SUFFIX(_pcre2_compile_context_init_)
 #define _pcre2_match_context_init    PCRE2_SUFFIX(_pcre2_match_context_init_)
 #define _pcre2_memctl_malloc         PCRE2_SUFFIX(_pcre2_memctl_malloc_)
 /* Like its siblings, the width-suffixed symbol keeps the _pcre2_ prefix. */
 #define _pcre2_strcmp                PCRE2_SUFFIX(_pcre2_strcmp_)
 extern void     _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
 extern void     _pcre2_match_context_init(pcre2_match_context *, BOOL);
-extern void    *_pcre2_memctl_malloc(size_t, size_t, pcre2_general_context *);
+extern void    *_pcre2_memctl_malloc(size_t, size_t, pcre2_memctl *);
 extern int      _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
 #endif
 /* End of pcre2_internal.h */
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@ -0,0 +1,258 @@
 /*************************************************
 *      Perl-Compatible Regular Expressions       *
 *************************************************/
 /* PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2014 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 /* This module contains mode-dependent macro and structure definitions. The 
 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
 These mode-dependent items are kept in a separate file so that they can also be
 #included multiple times for different code unit widths by pcre2test. Start by 
 undefining all the new macros defined herein so that they can be redefined for 
 multiple inclusions. */
 #undef CU2BYTES
 #undef GET
 #undef GET2
 #undef IMM2_SIZE
 #undef MAX_PATTERN_SIZE
 #undef PUT
 #undef PUT2
 #undef PUTINC
 /* ---------------------------MACROS ----------------------------- */
 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
 (always stored in big-endian order in 8-bit mode) by default. These are used,
 for example, to link from the start of a subpattern to its alternatives and its
 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
 to around 64K, which is big enough for almost everybody. However, I received a
 request for an even bigger limit. For this reason, and also to make the code
 easier to maintain, the storing and loading of offsets from the compiled code
 unit string is now handled by the macros that are defined here.
 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
 values of 3 or 4 are also supported. */
 /* PUT(a,n,d) stores the link value d at offset n in the code unit vector a,
 and GET(a,n) loads it back. MAX_PATTERN_SIZE is the corresponding limit on the
 size of a compiled pattern. Every use of a macro argument is parenthesized and
 each expansion is a single parenthesized expression, so that arguments such as
 "code + 1" and uses inside larger expressions behave as expected. The
 arguments are evaluated more than once and so must not have side effects.

 In the 16-bit and 32-bit modes LINK_SIZE itself is redefined so that it counts
 code units rather than bytes. NOTE(review): because LINK_SIZE is overwritten
 here, a later inclusion of this file at a different width sees the rewritten
 value - confirm that the inclusion order used by pcre2test copes with this. */
 /* ------------------- 8-bit support  ------------------ */
 #if PCRE2_CODE_UNIT_WIDTH == 8
 #if LINK_SIZE == 2
 #define PUT(a,n,d)   \
  (((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255))
 #define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])
 #define MAX_PATTERN_SIZE (1 << 16)
 #elif LINK_SIZE == 3
 #define PUT(a,n,d)       \
  (((a)[n] = (d) >> 16),    \
  ((a)[(n)+1] = (d) >> 8), \
  ((a)[(n)+2] = (d) & 255))
 #define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
 #define MAX_PATTERN_SIZE (1 << 24)
 #elif LINK_SIZE == 4
 #define PUT(a,n,d)        \
  (((a)[n] = (d) >> 24),     \
  ((a)[(n)+1] = (d) >> 16), \
  ((a)[(n)+2] = (d) >> 8),  \
  ((a)[(n)+3] = (d) & 255))
 #define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
 #define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
 #else
 #error LINK_SIZE must be either 2, 3, or 4
 #endif
 /* ------------------- 16-bit support  ------------------ */
 #elif PCRE2_CODE_UNIT_WIDTH == 16
 #if LINK_SIZE == 2
 #undef LINK_SIZE
 #define LINK_SIZE 1
 #define PUT(a,n,d)   \
  ((a)[n] = (d))
 #define GET(a,n) \
  ((a)[n])
 #define MAX_PATTERN_SIZE (1 << 16)
 #elif LINK_SIZE == 3 || LINK_SIZE == 4
 #undef LINK_SIZE
 #define LINK_SIZE 2
 #define PUT(a,n,d)   \
  (((a)[n] = (d) >> 16), \
  ((a)[(n)+1] = (d) & 65535))
 #define GET(a,n) \
  (((a)[n] << 16) | (a)[(n)+1])
 #define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
 #else
 #error LINK_SIZE must be either 2, 3, or 4
 #endif
 /* ------------------- 32-bit support  ------------------ */
 #elif PCRE2_CODE_UNIT_WIDTH == 32
 #undef LINK_SIZE
 #define LINK_SIZE 1
 #define PUT(a,n,d)   \
  ((a)[n] = (d))
 #define GET(a,n) \
  ((a)[n])
 #define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
 #else
 #error Unsupported compiling mode
 #endif
 /* -------------------------------------------------------*/
 /* PCRE uses some other (at least) 16-bit quantities that do not change when
 the size of offsets changes. These are used for repeat counts and for other
 things such as capturing parenthesis numbers in back references.
 Define the number of code units required to hold a 16-bit count/offset, and
 macros to load and store such a value. For reasons that I do not understand,
 the expression in the 8-bit GET2 macro is treated by gcc as a signed
 expression, even when a is declared as unsigned. It seems that any kind of
 arithmetic results in a signed value. Hence the cast.

 Each macro expands to a single parenthesized expression with parenthesized
 arguments, so PUT2 can be used as the body of an unbraced if/else and the
 vector argument may be an expression such as "code + 1". The arguments are
 evaluated more than once in 8-bit mode and so must not have side effects. */
 #if PCRE2_CODE_UNIT_WIDTH == 8
 #define IMM2_SIZE 2
 #define GET2(a,n) ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
 #define PUT2(a,n,d) (((a)[n] = (d) >> 8), ((a)[(n)+1] = (d) & 255))
 #else  /* Code units are 16 or 32 bits */
 #define IMM2_SIZE 1
 #define GET2(a,n) ((a)[n])
 #define PUT2(a,n,d) ((a)[n] = (d))
 #endif
 /* Mode-dependent macros that have the same definition in all modes.
 CU2BYTES(x) converts a count of code units into a count of bytes; PUTINC
 stores a link value with PUT and then advances the pointer a past it. Both
 expansions are fully parenthesized so that they can be used safely as operands
 of other operators (for example as a divisor). */
 #define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH)/8))
 #define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
 /* --------------------------- STRUCTURES ----------------------------- */
 /* The real general context structure. At present it holds only data for
 custom memory control. As in the other structures in this file, the memory
 control block is the first member. */
 typedef struct pcre2_real_general_context {
  pcre2_memctl    memctl;
 } pcre2_real_general_context;
 /* The real compile context structure. It starts with the memory control block
 and is followed by the settings that apply to a compilation. */
 typedef struct pcre2_real_compile_context {
  pcre2_memctl    memctl;
  int       (*stack_guard)(uint32_t);  /* Optional callback; NOTE(review):
                                          presumably asked whether deeper
                                          nesting is safe - confirm */
  const unsigned char *tables;         /* Presumably the character tables to
                                          compile with - confirm */
  uint16_t  bsr_convention;            /* What \R matches */
  uint16_t  newline_convention;        /* What is a newline? */
  uint32_t  parens_nest_limit;         /* Presumably the maximum parenthesis
                                          nesting depth - confirm */
 } pcre2_real_compile_context;
 /* The real match context structure. It starts with the memory control block.
 The two stack allocator hooks exist only when the library is built with
 NO_RECURSE defined, so the layout of this structure depends on that build
 option. */
 typedef struct pcre2_real_match_context {
  pcre2_memctl    memctl;
 #ifdef NO_RECURSE
  void *    (*stack_malloc)(size_t, void *);
  void      (*stack_free)(void *, void *);
 #endif   
  int       (*callout)(pcre2_callout_block *, void *);  /* Callout function */
  uint32_t  match_limit;       /* NOTE(review): presumably the caller's match
                                  limit - confirm against pcre2_match() */
  uint32_t  recursion_limit;   /* NOTE(review): presumably the caller's
                                  recursion limit - confirm */
 } pcre2_real_match_context;
 /* The real compiled code structure. This is a header: the name table starts
 immediately after it in memory (both pcre2_printint() and the handling of
 PCRE2_INFO_NAMETABLE compute its address as the structure's address plus
 sizeof(pcre2_real_code)), and pcre2_printint() finds the compiled code
 name_count * name_entry_size code units after the start of the name table. */
 typedef struct pcre2_real_code {
  pcre2_memctl   memctl;
  void    *executable_jit;        /* Pointer to JIT code */  
  uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
  uint32_t magic_number;          /* Paranoid and endianness check */
  uint32_t size;                  /* Total (bytes) that was malloc-ed */ 
  uint32_t compile_options;       /* Options passed to pcre2_compile() */
  uint32_t pattern_options;       /* Options taken from the pattern */
  uint32_t flags;                 /* Various state flags */
  uint32_t limit_match;           /* Limit set in the pattern */
  uint32_t limit_recursion;       /* Limit set in the pattern */
  uint32_t first_codeunit;        /* Starting code unit */
  uint32_t last_codeunit;         /* This codeunit must be seen */
  uint16_t bsr_convention;        /* What \R matches */
  uint16_t newline_convention;    /* What is a newline? */  
  uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
  uint16_t minlength;             /* Minimum length of match */ 
  uint16_t top_bracket;           /* Highest numbered group */ 
  uint16_t top_backref;           /* Highest numbered back reference */
  uint16_t name_entry_size;       /* Size (code units) of table entries */
  uint16_t name_count;            /* Number of name entries in the table */
 } pcre2_real_code;
 /* The real match data structure. It records the outcome of a match. The
 offset vector is the last member and is declared with a single element;
 pcre2_match_data_create() allocates additional space after the structure, so
 the vector extends beyond the declared size. */
 typedef struct pcre2_real_match_data {
  pcre2_memctl     memctl;
  const pcre2_real_code *code;    /* The pattern used for the match */
  PCRE2_SPTR       subject;       /* The subject that was matched */
  int              rc;            /* The return code from the match */
  int              utf_reason;    /* Reason code for bad UTF */  
  size_t           leftchar;      /* Offset to leftmost code unit */
  size_t           rightchar;     /* Offset to rightmost code unit */
  size_t           startchar;     /* Offset to starting code unit */  
  PCRE2_SPTR       mark;          /* Pointer to last mark */  
  uint16_t         oveccount;     /* Number of pairs */
  size_t           ovector[1];    /* First element of the offset vector; the
                                     rest follows in the same allocation */ 
 } pcre2_real_match_data;
 /* End of pcre2_intmodedep.h */
--- a/src/pcre2_intstructs.h
+++ b/src/pcre2_intstructs.h
@ -1,114 +0,0 @@
 /*************************************************
 *      Perl-Compatible Regular Expressions       *
 *************************************************/
 /* PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2014 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 /* This module contains the private mode-dependent structures needed by
 pcre2_internal.h. They are kept separate so that they can be #included multiple
 times for different code unit widths by pcre2test. */
 /* The real general context structure. At present it hold only data for custom 
 memory control. */
 typedef struct pcre2_real_general_context {
  pcre2_memctl    memctl;
 } pcre2_real_general_context;
 /* The real compile context structure */
 typedef struct pcre2_real_compile_context {
  pcre2_memctl    memctl;
  int       (*stack_guard)(uint32_t);
  const unsigned char *tables;
  uint16_t  bsr_convention;
  uint16_t  newline_convention;
  uint32_t  parens_nest_limit;
 } pcre2_real_compile_context;
 /* The real match context structure. */
 typedef struct pcre2_real_match_context {
  pcre2_memctl    memctl;
 #ifdef NO_RECURSE
  void *    (*stack_malloc)(size_t, void *);
  void      (*stack_free)(void *, void *);
 #endif   
  int       (*callout)(pcre2_callout_block *, void *);
  uint32_t  match_limit;
  uint32_t  recursion_limit;
 } pcre2_real_match_context;
 /* The reat match data structure. */
 typedef struct pcre2_real_match_data {
  pcre2_memctl    memctl;
  size_t    leftchar;             /* Offset to leftmost code unit */
  size_t    rightchar;            /* Offset to rightmost code unit */
  size_t    startchar;            /* Offset to starting code unit */  
  PCRE2_SPTR mark;                /* Pointer to last mark */  
  uint16_t  oveccount;            /* Number of pairs */
  size_t    ovector[1];           /* The first field */ 
 } pcre2_real_match_data;
 /* The real compiled code structure */
 typedef struct pcre2_real_code {
  pcre2_memctl   memctl;
  void    *executable_jit;        /* Pointer to JIT code */  
  uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
  uint32_t magic_number;          /* Paranoid and endianness check */
  uint32_t size;                  /* Total that was malloc-ed */ 
  uint32_t compile_options;       /* Options passed to pcre2_compile() */
  uint32_t pattern_options;       /* Options taken from the pattern */
  uint32_t flags;                 /* Various state flags */
  uint32_t limit_match;           /* Limit set in the pattern */
  uint32_t limit_recursion;       /* Limit set in the pattern */
  uint32_t first_codeunit;        /* Starting code unit */
  uint32_t last_codeunit;         /* This codeunit must be seen */
  uint16_t bsr_convention;        /* What \R matches */
  uint16_t newline_convention;    /* What is a newline? */  
  uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
  uint16_t minlength;             /* Minimum length of match */ 
  uint16_t top_bracket;           /* Highest numbered group */ 
  uint16_t top_backref;           /* Highest numbered back reference */
  uint16_t name_table_offset;     /* Offset to name table that follows */
  uint16_t name_entry_size;       /* Size of name items in the table */
  uint16_t name_count;            /* Number of name entries in the table */
 } pcre2_real_code;
 /* End of pcre2_intstructs.h */
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -76,11 +76,19 @@ pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, int length,
  size_t start_offset, uint32_t options, pcre2_match_data *match_data, 
  pcre2_match_context *mcontext)
 {
 int rc = PCRE2_ERROR_NOMATCH;
-/* Fudge for testing pcre2test */
+mcontext=mcontext;length=length;
 options=options;
-if (subject[start_offset] == 'Y')
+
 /* Fudges for testing pcre2test */
 if (subject[0] == 'Y')
  {
  rc = 0;
  match_data->code = code;
  match_data->subject = subject;  
  match_data->leftchar = 0;
  match_data->rightchar = 3;
  match_data->startchar = 0;
@ -88,24 +96,51 @@ if (subject[start_offset] == 'Y')
  switch (match_data->oveccount)
    {
-    case 0: return 0;
+    case 0: break;
    case 1: match_data->ovector[0] = start_offset; 
            match_data->ovector[1] = start_offset + 4;
-            return 0;
+            break;
-    default: match_data->ovector[0] = start_offset; 
+    default:
    case 6:  match_data->ovector[10] = PCRE2_UNSET;
             match_data->ovector[11] = PCRE2_UNSET; 
    case 5:  match_data->ovector[8] = PCRE2_UNSET;
             match_data->ovector[9] = PCRE2_UNSET; 
    case 4:  match_data->ovector[6] = start_offset + 3;
             match_data->ovector[7] = start_offset + 4; 
             rc += 2; 
    case 3:  match_data->ovector[4] = PCRE2_UNSET;
             match_data->ovector[5] = PCRE2_UNSET; 
    case 2:  match_data->ovector[0] = start_offset; 
             match_data->ovector[1] = start_offset + 4;
             match_data->ovector[2] = start_offset + 1;    
             match_data->ovector[3] = start_offset + 3;
-             return 2;
+             match_data->mark = subject; 
             rc += 2;
             break; 
    }  
  } 
 else if (subject[0] == 'P')
  {
  rc = PCRE2_ERROR_PARTIAL;
  match_data->code = code;
  match_data->subject = subject;  
  match_data->leftchar = 0;
  match_data->rightchar = length;
  match_data->startchar = 1;
  match_data->mark = NULL;
  } 
-mcontext=mcontext;code=code;subject=subject;length=length;
+match_data->rc = rc;
-start_offset=start_offset; options=options; match_data=match_data;
+return rc; 
 return PCRE2_ERROR_NOMATCH;
 }    
 /* End of pcre2_match.c */
--- a/src/pcre2_match_data.c
+++ b/src/pcre2_match_data.c
@ -56,7 +56,7 @@ pcre2_match_data_create(size_t oveccount, pcre2_general_context *gcontext)
 {
 pcre2_match_data *yield = PRIV(memctl_malloc)(
  sizeof(pcre2_match_data) + 3*oveccount*sizeof(size_t),
-  offsetof(pcre2_real_match_data, memctl), gcontext);
+  offsetof(pcre2_real_match_data, memctl), &(gcontext->memctl));
 yield->oveccount = oveccount;
 return yield;
 }
--- a/src/pcre2_pattern_info.c
+++ b/src/pcre2_pattern_info.c
@ -167,7 +167,7 @@ switch(what)
  break;
  case PCRE2_INFO_NAMETABLE:
-  *((PCRE2_SPTR*)where) = (PCRE2_SPTR)re + re->name_table_offset;
+  *((PCRE2_SPTR*)where) = (PCRE2_SPTR)((char *)re + sizeof(pcre2_real_code));
  break;
  case PCRE2_INFO_NEWLINE_CONVENTION:
--- a/src/pcre2_printint.c
+++ b/src/pcre2_printint.c
@ -1 +1,787 @@
-/* This is a placeholder for pcre2_printint.c */
+/*************************************************
 *      Perl-Compatible Regular Expressions       *
 *************************************************/
 /* PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2014 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 /* This module contains a PCRE private debugging function for printing out the
 internal form of a compiled regular expression, along with some supporting
 local functions. This source file is #included in pcre2test.c at each supported 
 code unit width, with PCRE2_SUFFIX set appropriately, just like the functions 
 that comprise the library. */
 /* Tables of operator names. The same 8-bit table is used for all code unit
 widths, so it must be defined only once. The list itself is defined in
 pcre2_internal.h, which is #included by pcre2test before this file. */
 /* OP_names is not mode-dependent, so the OP_LISTS_DEFINED guard ensures that
 it is defined only by the first inclusion of this file. */
 #ifndef OP_LISTS_DEFINED
 static const char *OP_names[] = { OP_NAME_LIST };
 #define OP_LISTS_DEFINED
 #endif
 /* The functions and tables herein must all have mode-dependent names. Each
 name below is mapped through PCRE2_SUFFIX so that every inclusion of this
 file, at a different code unit width, defines a distinct set of identifiers. */
 #define OP_lengths        PCRE2_SUFFIX(OP_lengths_)
 #define get_ucpname       PCRE2_SUFFIX(get_ucpname_)
 #define pcre2_printint    PCRE2_SUFFIX(pcre2_printint_)
 #define print_char        PCRE2_SUFFIX(print_char_)
 #define print_custring    PCRE2_SUFFIX(print_custring_)
 #define print_prop        PCRE2_SUFFIX(print_prop_)
 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
 the definition is next to the definition of the opcodes in pcre2_internal.h.
 The contents of the table are, however, mode-dependent. */
 static const uint8_t OP_lengths[] = { OP_LENGTHS };
 /*************************************************
 *       Print one character from a string        *
 *************************************************/
 /* In UTF mode the character may occupy more than one code unit. A character
 that fits in one code unit is printed literally if it is printable, and
 otherwise as a \x escape. A multi-unit UTF character is printed as \x{...},
 and a malformed sequence is printed with \X{...} as an indication.
 Arguments:
  f           file to write to
  ptr         pointer to first code unit of the character
  utf         TRUE if string is UTF (will be FALSE if UTF is not supported)
 Returns:      number of additional code units used
 */
 static unsigned int
 print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf)
 {
 uint32_t c = *ptr;
 #if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
 int a, i, s;
 #endif
 /* If UTF is supported and requested, check for a one-code-unit character.
 From here on, utf is TRUE only if the character needs the per-width handling
 below: a lead byte in UTF-8, a high surrogate in UTF-16, or a surrogate value
 (which is malformed) in UTF-32. The 16-bit and 32-bit tests are for malformed
 UTF, and should only trigger if the sanity check is turned off. */
 #ifdef SUPPORT_UTF
 if (utf)
  {
 #if PCRE2_CODE_UNIT_WIDTH == 8
  utf = (c & 0xc0) == 0xc0;
 #elif PCRE2_CODE_UNIT_WIDTH == 16
  utf = (c & 0xfc00) == 0xd800;
 #else
  utf = (c & 0xfffff800u) == 0xd800u;
 #endif
  }
 #else
 /* Without UTF support every character is a single code unit, whatever the
 caller passed. */
 utf = 0;
 #endif  /* SUPPORT_UTF */
 /* Handle a one-code-unit character at any width. */
 if (!utf)
  {
  if (PRINTABLE(c)) fprintf(f, "%c", (char)c);
  else if (c < 0x80) fprintf(f, "\\x%02x", c);
  else fprintf(f, "\\x{%02x}", c);
  return 0;
  }
 /* Per-width code for handling non-one-code-unit UTF characters. */
 #ifdef SUPPORT_UTF
 /* Handle a multi-byte UTF-8 character. */
 #if PCRE2_CODE_UNIT_WIDTH == 8
 a = utf8_table4[c & 0x3f];  /* Number of additional bytes */
 s = 6*a;
 c = (c & utf8_table3[a]) << s;
 for (i = 1; i <= a; i++)
  {
  /* This is a check for malformed UTF-8; it should only occur if the sanity
  check has been turned off. Rather than swallow random bytes, just stop if
  we hit a bad one. Print it with \X instead of \x as an indication. */
  if ((ptr[i] & 0xc0) != 0x80)
    {
    fprintf(f, "\\X{%x}", c);
    return i - 1;
    }
  /* The byte is OK */
  s -= 6;
  c |= (ptr[i] & 0x3f) << s;
  }
 fprintf(f, "\\x{%x}", c);
 return a;
 /* Handle a multi-code-unit UTF-16 character, starting with a check for
 malformed UTF-16; it should only occur if the sanity check has been turned off.
 Rather than swallow a low surrogate, just stop if we hit a bad one. Print it
 with \X instead of \x as an indication. */
 #elif PCRE2_CODE_UNIT_WIDTH == 16
 if ((ptr[1] & 0xfc00) != 0xdc00)
  {
  fprintf(f, "\\X{%x}", c);
  return 0;
  }
 c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
 fprintf(f, "\\x{%x}", c);
 return 1;
 /* For UTF-32 we get here only for a malformed code unit, which should only
 occur if the sanity check has been turned off. Print it with \X instead of \x
 as an indication. */
 #else
 fprintf(f, "\\X{%x}", c);
 return 0;
 #endif  /* PCRE2_CODE_UNIT_WIDTH */
 #else
 /* Not reached, because utf is forced to be false above, but it ensures that
 every path through the function returns a value. */
 return 0;
 #endif /* SUPPORT_UTF */
 }
 /*************************************************
 *     Print string as a list of code units       *
 *************************************************/
 /* This takes no account of UTF as it always prints each individual code unit. 
 The string is zero-terminated.
 Arguments:
  f          file to write to
  ptr        point to the string
 Returns:     nothing
 */  
 static void
 print_custring(FILE *f, PCRE2_SPTR ptr)
 {
 /* Step through the code units up to, but not including, the terminating
 zero. Printable units are output as they are; all others as a hex escape. */
 for (; *ptr != '\0'; ptr++)
  {
  uint32_t unit = *ptr;
  if (!PRINTABLE(unit))
    fprintf(f, "\\x{%x}", unit);
  else
    fprintf(f, "%c", unit);
  }
 }
 /*************************************************
 *          Find Unicode property name            *
 *************************************************/
 /* Search the property table from its last entry backwards for an entry whose
 type and value both match, and return a pointer to that entry's name within
 the names string. If nothing matches, the string "??" is returned. */
 static const char *
 get_ucpname(unsigned int ptype, unsigned int pvalue)
 {
 int idx = utt_size;
 while (--idx >= 0)
  {
  if (utt[idx].type == ptype && utt[idx].value == pvalue)
    return utt_names + utt[idx].name_offset;
  }
 return "??";
 }
 /*************************************************
 *       Print Unicode property value             *
 *************************************************/
 /* "Normal" properties can be printed from tables. The PT_CLIST property is a
 pseudo-property that contains a pointer to a list of case-equivalent
 characters. 
 Arguments:
  f            file to write to
  code         pointer in the compiled code
  before       text to print before
  after        text to print after
 Returns:       nothing    
 */
 static void
 print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after)
 {
 const char *negation;
 const uint32_t *member;
 /* The PT_CLIST pseudo-property is shown as the list of characters that it
 refers to; the list ends at the first value that is not below NOTACHAR. */
 if (code[1] == PT_CLIST)
  {
  negation = (*code != OP_PROP)? "not " : "";
  fprintf (f, "%s%sclist", before, negation);
  for (member = ucd_caseless_sets + code[2]; *member < NOTACHAR; member++)
    fprintf(f, " %04x", *member);
  fprintf(f, "%s", after);
  return;
  }
 /* Any other property is shown as the opcode name followed by the name of the
 property, which is looked up from its type and value. */
 fprintf(f, "%s%s %s%s", before, OP_names[*code], get_ucpname(code[1],
   code[2]), after);
 }
 /*************************************************
 *            Print compiled pattern              *
 *************************************************/
 /* The print_lengths flag controls whether offsets and lengths of items are
 printed. Lengths can be turned off from pcre2test so that automatic tests on
 bytecode can be written that do not depend on the value of LINK_SIZE. 
 Arguments:
  re              a compiled pattern
  f               the file to write to
  print_lengths   show various lengths  
 Returns:          nothing 
 */
 static void
 pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths)
 {
 PCRE2_SPTR codestart, nametable, code;
 uint32_t options = re->compile_options;
 size_t nesize = re->name_entry_size;
 BOOL utf = (options & PCRE2_UTF) != 0;
 nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
 code = codestart = nametable + re->name_count * re->name_entry_size;
 for(;;)
  {
  PCRE2_SPTR ccode;
  uint32_t c;
  const char *flag = "  ";
  unsigned int extra = 0;
  if (print_lengths)
    fprintf(f, "%3d ", (int)(code - codestart));
  else
    fprintf(f, "    ");
  switch(*code)
    {
 /* ========================================================================== */
      /* These cases are never obeyed. This is a fudge that causes a compile-
      time error if the vectors OP_names or OP_lengths, which are indexed
      by opcode, are not the correct length. It seems to be the only way to do
      such a check at compile time, as the sizeof() operator does not work in
      the C preprocessor. */
      case OP_TABLE_LENGTH:
      case OP_TABLE_LENGTH +
        ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
        (sizeof(OP_lengths) == OP_TABLE_LENGTH)):
      break;
 /* ========================================================================== */
    case OP_END:
    fprintf(f, "    %s\n", OP_names[*code]);
    fprintf(f, "------------------------------------------------------------------\n");
    return;
    case OP_CHAR:
    fprintf(f, "    ");
    do
      {
      code++;
      code += 1 + print_char(f, code, utf);
      }
    while (*code == OP_CHAR);
    fprintf(f, "\n");
    continue;
    case OP_CHARI:
    fprintf(f, " /i ");
    do
      {
      code++;
      code += 1 + print_char(f, code, utf);
      }
    while (*code == OP_CHARI);
    fprintf(f, "\n");
    continue;
    case OP_CBRA:
    case OP_CBRAPOS:
    case OP_SCBRA:
    case OP_SCBRAPOS:
    if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
      else fprintf(f, "    ");
    fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
    break;
    case OP_BRA:
    case OP_BRAPOS:
    case OP_SBRA:
    case OP_SBRAPOS:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_KETRPOS:
    case OP_ALT:
    case OP_KET:
    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    case OP_ONCE:
    case OP_ONCE_NC:
    case OP_COND:
    case OP_SCOND:
    case OP_REVERSE:
    if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
      else fprintf(f, "    ");
    fprintf(f, "%s", OP_names[*code]);
    break;
    case OP_CLOSE:
    fprintf(f, "    %s %d", OP_names[*code], GET2(code, 1));
    break;
    case OP_CREF:
    fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
    break;
    case OP_DNCREF:
      {
      PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
      fprintf(f, " %s Cond ref <", flag);
      print_custring(f, entry);
      fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
      }
    break;
    case OP_RREF:
    c = GET2(code, 1);
    if (c == RREF_ANY)
      fprintf(f, "    Cond recurse any");
    else
      fprintf(f, "    Cond recurse %d", c);
    break;
    case OP_DNRREF:
      {
      PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
      fprintf(f, " %s Cond recurse <", flag);
      print_custring(f, entry);
      fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
      }
    break;
    case OP_DEF:
    fprintf(f, "    Cond def");
    break;
    case OP_STARI:
    case OP_MINSTARI:
    case OP_POSSTARI:
    case OP_PLUSI:
    case OP_MINPLUSI:
    case OP_POSPLUSI:
    case OP_QUERYI:
    case OP_MINQUERYI:
    case OP_POSQUERYI:
    flag = "/i";
    /* Fall through */
    case OP_STAR:
    case OP_MINSTAR:
    case OP_POSSTAR:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_POSQUERY:
    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSQUERY:
    fprintf(f, " %s ", flag);
    if (*code >= OP_TYPESTAR)
      {
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
        {
        print_prop(f, code + 1, "", " ");
        extra = 2;
        }
      else fprintf(f, "%s", OP_names[code[1]]);
      }
    else extra = print_char(f, code+1, utf);
    fprintf(f, "%s", OP_names[*code]);
    break;
    case OP_EXACTI:
    case OP_UPTOI:
    case OP_MINUPTOI:
    case OP_POSUPTOI:
    flag = "/i";
    /* Fall through */
    case OP_EXACT:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_POSUPTO:
    fprintf(f, " %s ", flag);
    extra = print_char(f, code + 1 + IMM2_SIZE, utf);
    fprintf(f, "{");
    if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
    fprintf(f, "%d}", GET2(code,1));
    if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
      else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
    break;
    case OP_TYPEEXACT:
    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEPOSUPTO:
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
      {
      print_prop(f, code + IMM2_SIZE + 1, "    ", " ");
      extra = 2;
      }
    else fprintf(f, "    %s", OP_names[code[1 + IMM2_SIZE]]);
    fprintf(f, "{");
    if (*code != OP_TYPEEXACT) fprintf(f, "0,");
    fprintf(f, "%d}", GET2(code,1));
    if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
      else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
    break;
    case OP_NOTI:
    flag = "/i";
    /* Fall through */
    case OP_NOT:
    fprintf(f, " %s [^", flag);
    extra = print_char(f, code + 1, utf);
    fprintf(f, "]");
    break;
    case OP_NOTSTARI:
    case OP_NOTMINSTARI:
    case OP_NOTPOSSTARI:
    case OP_NOTPLUSI:
    case OP_NOTMINPLUSI:
    case OP_NOTPOSPLUSI:
    case OP_NOTQUERYI:
    case OP_NOTMINQUERYI:
    case OP_NOTPOSQUERYI:
    flag = "/i";
    /* Fall through */
    case OP_NOTSTAR:
    case OP_NOTMINSTAR:
    case OP_NOTPOSSTAR:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTPOSPLUS:
    case OP_NOTQUERY:
    case OP_NOTMINQUERY:
    case OP_NOTPOSQUERY:
    fprintf(f, " %s [^", flag);
    extra = print_char(f, code + 1, utf);
    fprintf(f, "]%s", OP_names[*code]);
    break;
    case OP_NOTEXACTI:
    case OP_NOTUPTOI:
    case OP_NOTMINUPTOI:
    case OP_NOTPOSUPTOI:
    flag = "/i";
    /* Fall through */
    case OP_NOTEXACT:
    case OP_NOTUPTO:
    case OP_NOTMINUPTO:
    case OP_NOTPOSUPTO:
    fprintf(f, " %s [^", flag);
    extra = print_char(f, code + 1 + IMM2_SIZE, utf);
    fprintf(f, "]{");
    if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
    fprintf(f, "%d}", GET2(code,1));
    if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
      else
    if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
    break;
    case OP_RECURSE:
    if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
      else fprintf(f, "    ");
    fprintf(f, "%s", OP_names[*code]);
    break;
    case OP_REFI:
    flag = "/i";
    /* Fall through */
    case OP_REF:
    fprintf(f, " %s \\%d", flag, GET2(code,1));
    ccode = code + OP_lengths[*code];
    goto CLASS_REF_REPEAT;
    case OP_DNREFI:
    flag = "/i";
    /* Fall through */
    case OP_DNREF:
      {
      PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
      fprintf(f, " %s \\k<", flag);
      print_custring(f, entry);
      fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
      }
    ccode = code + OP_lengths[*code];
    goto CLASS_REF_REPEAT;
    case OP_CALLOUT:
    fprintf(f, "    %s %d %d %d", OP_names[*code], code[1], GET(code,2),
      GET(code, 2 + LINK_SIZE));
    break;
    case OP_PROP:
    case OP_NOTPROP:
    print_prop(f, code, "    ", "");
    break;
    /* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm
    in having this code always here, and it makes it less messy without all
    those #ifdefs. */
    case OP_CLASS:
    case OP_NCLASS:
    case OP_XCLASS:
      {
      int i;
      unsigned int min, max;
      BOOL printmap;
      BOOL invertmap = FALSE;
      uint8_t *map;
      uint8_t inverted_map[32];
      fprintf(f, "    [");
      if (*code == OP_XCLASS)
        {
        extra = GET(code, 1);
        ccode = code + LINK_SIZE + 1;
        printmap = (*ccode & XCL_MAP) != 0;
        if ((*ccode & XCL_NOT) != 0)
          {
          invertmap = (*ccode & XCL_HASPROP) == 0;
          fprintf(f, "^");
          }
        ccode++;
        }
      else
        {
        printmap = TRUE;
        ccode = code + 1;
        }
      /* Print a bit map */
      if (printmap)
        {
        map = (uint8_t *)ccode;
        if (invertmap)
          {
          for (i = 0; i < 32; i++) inverted_map[i] = ~map[i];
          map = inverted_map;
          }
        for (i = 0; i < 256; i++)
          {
          if ((map[i/8] & (1 << (i&7))) != 0)
            {
            int j;
            for (j = i+1; j < 256; j++)
              if ((map[j/8] & (1 << (j&7))) == 0) break;
            if (i == '-' || i == ']') fprintf(f, "\\");
            if (PRINTABLE(i)) fprintf(f, "%c", i);
              else fprintf(f, "\\x%02x", i);
            if (--j > i)
              {
              if (j != i + 1) fprintf(f, "-");
              if (j == '-' || j == ']') fprintf(f, "\\");
              if (PRINTABLE(j)) fprintf(f, "%c", j);
                else fprintf(f, "\\x%02x", j);
              }
            i = j;
            }
          }
        ccode += 32 / sizeof(PCRE2_UCHAR);
        }
      /* For an XCLASS there is always some additional data */
      if (*code == OP_XCLASS)
        {
        PCRE2_UCHAR ch;
        while ((ch = *ccode++) != XCL_END)
          {
          BOOL not = FALSE;
          const char *notch = "";
          switch(ch)
            {
            case XCL_NOTPROP:
            not = TRUE;
            notch = "^";
            /* Fall through */
            case XCL_PROP:
              {
              unsigned int ptype = *ccode++;
              unsigned int pvalue = *ccode++;
              switch(ptype)
                {
                case PT_PXGRAPH:
                fprintf(f, "[:%sgraph:]", notch);
                break;
                case PT_PXPRINT:
                fprintf(f, "[:%sprint:]", notch);
                break;
                case PT_PXPUNCT:
                fprintf(f, "[:%spunct:]", notch);
                break;
                default:
                fprintf(f, "\\%c{%s}", (not? 'P':'p'),
                  get_ucpname(ptype, pvalue));
                break;
                }
              }
            break;
            default:
            ccode += 1 + print_char(f, ccode, utf);
            if (ch == XCL_RANGE)
              {
              fprintf(f, "-");
              ccode += 1 + print_char(f, ccode, utf);
              }
            break;
            }
          }
        }
      /* Indicate a non-UTF class which was created by negation */
      fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
      /* Handle repeats after a class or a back reference */
      CLASS_REF_REPEAT:
      switch(*ccode)
        {
        case OP_CRSTAR:
        case OP_CRMINSTAR:
        case OP_CRPLUS:
        case OP_CRMINPLUS:
        case OP_CRQUERY:
        case OP_CRMINQUERY:
        case OP_CRPOSSTAR:
        case OP_CRPOSPLUS:
        case OP_CRPOSQUERY:
        fprintf(f, "%s", OP_names[*ccode]);
        extra += OP_lengths[*ccode];
        break;
        case OP_CRRANGE:
        case OP_CRMINRANGE:
        case OP_CRPOSRANGE:
        min = GET2(ccode,1);
        max = GET2(ccode,1 + IMM2_SIZE);
        if (max == 0) fprintf(f, "{%u,}", min);
        else fprintf(f, "{%u,%u}", min, max);
        if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
        else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+");
        extra += OP_lengths[*ccode];
        break;
        /* Do nothing if it's not a repeat; this code stops picky compilers
        warning about the lack of a default code path. */
        default:
        break;
        }
      }
    break;
    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    case OP_THEN_ARG:
    fprintf(f, "    %s ", OP_names[*code]);
    print_custring(f, code + 2);
    extra += code[1];
    break;
    case OP_THEN:
    fprintf(f, "    %s", OP_names[*code]);
    break;
    case OP_CIRCM:
    case OP_DOLLM:
    flag = "/m";
    /* Fall through */
    /* Anything else is just an item with no data, but possibly a flag. */
    default:
    fprintf(f, " %s %s", flag, OP_names[*code]);
    break;
    }
  code += OP_lengths[*code] + extra;
  fprintf(f, "\n");
  }
 }
 /* End of pcre2_printint.c */
--- a/src/pcre2_string_utils.c
+++ b/src/pcre2_string_utils.c
@ -0,0 +1,80 @@
 /*************************************************
 *      Perl-Compatible Regular Expressions       *
 *************************************************/
 /* PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2014 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 /* This module contains internal functions for comparing and finding the length
 of strings. These are used instead of strcmp() etc because the standard 
 functions work only on 8-bit data. */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #include "pcre2_internal.h"
 /* FIXME: this module is incomplete */
 /*************************************************
 *             Compare two strings                *
 *************************************************/
 /* 
 Arguments:
  str1        first string
  str2        second string
 Returns:      0, 1, or -1
 */
 int
 PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2)
 {
 /* Step through both strings together. The first pair of code units that
 differ decides the result (1 if the unit from str1 is the larger, otherwise
 -1). Seeing a zero in both strings at the same position means they are equal. */
 for (;; str1++, str2++)
  {
  PCRE2_UCHAR a = *str1;
  PCRE2_UCHAR b = *str2;
  if (a != b) return (a > b)? 1 : -1;
  if (a == 0) return 0;
  }
 }
 /* End of pcre2_string_utils.c */
--- a/src/pcre2_substring.c
+++ b/src/pcre2_substring.c
@ -46,8 +46,6 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "pcre2_internal.h"
 /* FIXME: most of these are currently placeholder functions */
 /*************************************************
 *   Copy named captured string to given buffer   *
@ -75,7 +73,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname,
  PCRE2_UCHAR *buffer, size_t size)
 {
-match_data=match_data;stringname=stringname;buffer=buffer;size=size;
+PCRE2_SPTR first, last, entry;
 /* Ask the name table scanner for the first and last entries that carry this
 name. A result that is zero or negative is handed straight back to the
 caller. */
 int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
  &first, &last);
 if (entrysize <= 0) return entrysize;
 /* Step through the entries from first to last inclusive. Each entry starts
 with a group number; the first group that lies within the ovector and whose
 start offset is not PCRE2_UNSET is copied via the by-number function. */
 for (entry = first; entry <= last; entry += entrysize)
  {
  uint16_t n = GET2(entry, 0);
  if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
    return pcre2_substring_copy_bynumber(match_data, n, buffer, size); 
  }
 /* No entry with this name refers to a group that is usable. */
 return PCRE2_ERROR_NOSUBSTRING;
 }
@ -106,55 +113,17 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_copy_bynumber(pcre2_match_data *match_data, int stringnumber,
  PCRE2_UCHAR *buffer, size_t size)
 {
-match_data=match_data;stringnumber=stringnumber;buffer=buffer;size=size;
+size_t left, right;
-return PCRE2_ERROR_NOSUBSTRING;
+size_t p = 0;
-}
+PCRE2_SPTR subject = match_data->subject;
-
+if (stringnumber >= match_data->oveccount ||
-
+    (left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
-
+  return PCRE2_ERROR_NOSUBSTRING;
-/*************************************************
+right = match_data->ovector[stringnumber*2+1];
-*       Free memory obtained by get_substring    *
+if (right - left + 1 > size) return PCRE2_ERROR_NOMEMORY; 
-*************************************************/
+while (left < right) buffer[p++] = subject[left++];
-
+buffer[p] = 0;
-/* This function exists for the benefit of people calling PCRE from non-C
+return p;
 programs that can call its functions, but not free() itself.
 Arguments:
  context     points to a PCRE2 context
  string      the result of a previous pcre2_get_substring()
 Returns:      nothing
 */
 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 pcre2_substring_free(PCRE2_UCHAR *string)
 {
 /* Placeholder body: the self-assignment merely references the parameter
 (presumably to avoid an unused-parameter warning); nothing is freed here. */
 string=string;
 return;
 }
 /*************************************************
 *   Free memory obtained by get_substring_list   *
 *************************************************/
 /* This function exists for the benefit of people calling PCRE from non-C
 programs that can call its functions, but not free() itself.
 Arguments:
  context     points to a PCRE2 context
  list        the result of a previous pcre2_get_substring_list()
 Returns:      nothing
 */
 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 pcre2_substring_list_free(PCRE2_SPTR *list)
 {
 /* Placeholder body: the self-assignment merely references the parameter
 (presumably to avoid an unused-parameter warning); nothing is freed here. */
 list=list;
 return;
 }
@ -168,10 +137,9 @@ new memory. If the regex permits duplicate names, the first substring that is
 set is chosen.
 Arguments:
  context        points to a PCRE2 context
  match_data     pointer to match_data
  stringname     the name of the required substring
-  stringptr      where to put the pointer
+  stringptr      where to put the pointer to the new memory
 Returns:         if successful:
                   the length of the copied string, not including the zero
@ -185,7 +153,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_get_byname(pcre2_match_data *match_data,
  PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr)
 {
-match_data=match_data;stringname=stringname;stringptr=stringptr;
+PCRE2_SPTR first, last, entry;
 /* Ask the name table scanner for the first and last entries that carry this
 name. A result that is zero or negative is handed straight back to the
 caller. */
 int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
  &first, &last);
 if (entrysize <= 0) return entrysize;
 /* Step through the entries from first to last inclusive. Each entry starts
 with a group number; the first group that lies within the ovector and whose
 start offset is not PCRE2_UNSET is fetched via the by-number function, which
 obtains the new memory. */
 for (entry = first; entry <= last; entry += entrysize)
  {
  uint16_t n = GET2(entry, 0);
  if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
    return pcre2_substring_get_bynumber(match_data, n, stringptr); 
  }
 /* No entry with this name refers to a group that is usable. */
 return PCRE2_ERROR_NOSUBSTRING;
 }
@ -199,10 +176,9 @@ return PCRE2_ERROR_NOSUBSTRING;
 memory.
 Arguments:
  context        points to a PCRE2 context
  match_data     points to match data
  stringnumber   the number of the required substring
-  stringptr      where to put a pointer to the substring
+  stringptr      where to put a pointer to the new memory
 Returns:         if successful:
                   the length of the string, not including the zero that
@ -216,9 +192,44 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_get_bynumber(pcre2_match_data *match_data, int stringnumber,
  PCRE2_UCHAR **stringptr)
 {
-match_data=match_data;stringnumber=stringnumber;
+size_t left, right;
-stringptr=stringptr;
+size_t p = 0;
-return PCRE2_ERROR_NOSUBSTRING;
+void *block;
 PCRE2_UCHAR *yield;
 PCRE2_SPTR subject = match_data->subject;

 /* Copy captured substring number stringnumber into newly obtained memory and
 store a pointer to the zero-terminated copy in *stringptr. The yield is the
 number of code units copied (excluding the terminating zero), or
 PCRE2_ERROR_NOSUBSTRING / PCRE2_ERROR_NOMEMORY on failure.

 stringnumber is a signed int, so a negative value must be rejected explicitly
 before it is used as an ovector index. */

 if (stringnumber < 0 || stringnumber >= match_data->oveccount ||
    (left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
  return PCRE2_ERROR_NOSUBSTRING;
 right = match_data->ovector[stringnumber*2+1];

 /* The block holds a pcre2_memctl header followed by the string and its
 terminating zero. The string part is sized with CU2BYTES(), as is done in
 pcre2_substring_list_get(); PCRE2_CODE_UNIT_WIDTH is a width in bits, so
 multiplying by it would request eight times the memory needed. */

 block = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + CU2BYTES(right-left+1),
  0, &(match_data->memctl));
 if (block == NULL) return PCRE2_ERROR_NOMEMORY;

 /* The caller's string starts immediately after the memctl header, which is
 where pcre2_substring_free() expects to find it. */

 yield = (PCRE2_UCHAR *)((char *)block + sizeof(pcre2_memctl));
 while (left < right) yield[p++] = subject[left++];
 yield[p] = 0;
 *stringptr = yield;
 return p;
 }
 /*************************************************
 *       Free memory obtained by get_substring    *
 *************************************************/
 /* 
 Argument:     the result of a previous pcre2_substring_get_byxxx()
 Returns:      nothing
 */
 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
 pcre2_substring_free(PCRE2_UCHAR *string)
 {
 pcre2_memctl *memctl;

 /* Behave like free(): a NULL argument is accepted and ignored. Without this
 check the code below would compute an address in front of NULL and call
 through it. */

 if (string == NULL) return;

 /* The string handed out by pcre2_substring_get_bynumber() sits directly
 after a pcre2_memctl header in the same block; step back to the header and
 release the whole block with the free function recorded there. */

 memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl));
 memctl->free(memctl, memctl->memory_data);
 }
@ -242,7 +253,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_length_byname(pcre2_match_data *match_data,
  PCRE2_SPTR stringname)
 {
-match_data=match_data;stringname=stringname;
+PCRE2_SPTR first, last, entry;
 /* Ask the name table scanner for the first and last entries that carry this
 name. A result that is zero or negative is handed straight back to the
 caller. */
 int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
  &first, &last);
 if (entrysize <= 0) return entrysize;
 /* Step through the entries from first to last inclusive. Each entry starts
 with a group number; the length is taken from the first group that lies
 within the ovector and whose start offset is not PCRE2_UNSET. */
 for (entry = first; entry <= last; entry += entrysize)
  {
  uint16_t n = GET2(entry, 0);
  if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
    return pcre2_substring_length_bynumber(match_data, n); 
  }
 /* No entry with this name refers to a group that is usable. */
 return PCRE2_ERROR_NOSUBSTRING;
 }
@ -266,8 +286,11 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_length_bynumber(pcre2_match_data *match_data,
  int stringnumber)
 {
 /* A group number at or beyond oveccount, or one whose first ovector entry is
 PCRE2_UNSET, has no substring. */
 /* NOTE(review): stringnumber is a signed int and negative values are not
 rejected by this test - confirm callers never pass one. */
-match_data=match_data;stringnumber=stringnumber;
+if (stringnumber >= match_data->oveccount ||
-return PCRE2_ERROR_NOSUBSTRING;
+    match_data->ovector[stringnumber*2] == PCRE2_UNSET)
  return PCRE2_ERROR_NOSUBSTRING;
 /* The length is the difference between the two ovector entries that belong
 to this group. */
 return match_data->ovector[stringnumber*2 + 1] -
       match_data->ovector[stringnumber*2];
 }
@ -278,48 +301,88 @@ return PCRE2_ERROR_NOSUBSTRING;
 /* This function gets one chunk of memory and builds a list of pointers and all
 the captured substrings in it. A NULL pointer is put on the end of the list.
 The substrings are zero-terminated, but also, if the final argument is 
 non-NULL, a list of lengths is also returned. This allows binary data to be 
 handled.
 Arguments:
  context        points to a PCRE2 context
  match_data     points to the match data
  listptr        set to point to the list of pointers
  lengthsptr     set to point to the list of lengths (may be NULL) 
 Returns:         if successful: 0
                 if not successful, a negative error code:
-                   PCRE2_ERROR_NOMEMORY: failed to get memory
+                   PCRE2_ERROR_NOMEMORY: failed to get memory,
                   or a match failure code   
 */
 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, 
  size_t **lengthsptr)
 {
-match_data=match_data;listptr=listptr;lengthsptr=lengthsptr;
+int i, count, count2;
-return PCRE2_ERROR_NOMEMORY;
+size_t size;
 size_t *lensp, *ovector;
 pcre2_memctl *memp;
 PCRE2_UCHAR **listp;
 PCRE2_UCHAR *sp;
 /* The stored match return code gives the number of substrings; a negative
 value is returned to the caller unchanged. */
 if ((count = match_data->rc) < 0) return count;
 count2 = 2*count;
 ovector = match_data->ovector;
 /* First pass: add up the size of the single block. It consists of a
 pcre2_memctl header, count+1 pointers (the extra one is the final NULL), an
 optional array of count lengths, and every substring plus its terminating
 zero. */
 size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *);     /* For final NULL */
 if (lengthsptr != NULL) size += sizeof(size_t) * count;  /* For lengths */
 for (i = 0; i < count2; i += 2)
   size += sizeof(PCRE2_UCHAR *) + CU2BYTES(ovector[i+1] - ovector[i] + 1);
 memp = PRIV(memctl_malloc)(size, 0, &(match_data->memctl)); 
 if (memp == NULL) return PCRE2_ERROR_NOMEMORY;
 /* The pointer list starts directly after the memctl header; lensp is set to
 the address just past the count+1 pointers. */
 *listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl));
 lensp = (size_t *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1));
 /* Without a lengths array the string data begins where the lengths would
 have been; otherwise it begins after the count lengths. */
 if (lengthsptr == NULL)
  {
  sp = (PCRE2_UCHAR *)lensp; 
  lensp = NULL;
  }
 else
  {   
  *lengthsptr = lensp; 
  sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(size_t) * count); 
  } 
 /* Second pass: copy each substring, record its pointer (and its length when
 requested), and add a terminating zero. */
 /* NOTE(review): the length is computed as ovector[i+1] - ovector[i] with no
 check that the second value is not smaller than the first - confirm that
 cannot happen for the pairs seen here. */
 for (i = 0; i < count2; i += 2)
  {
  size = ovector[i+1] - ovector[i];
  memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size));
  *listp++ = sp;
  if (lensp != NULL) *lensp++ = size;
  sp += size;
  *sp++ = 0;
  }
 /* Terminate the pointer list. */
 *listp = NULL;
 return 0;
 }
 /*************************************************
-*           Find number for named string         *
+*   Free memory obtained by substring_list_get   *
 *************************************************/
-/* This function is used by the local get_first_set() function, as well
+/*
-as being generally available. It assumes that names are unique.
+Argument:     the result of a previous pcre2_substring_list_get()
-
+Returns:      nothing
 Arguments:
  code        the compiled regex
  stringname  the name whose number is required
 Returns:      the number of the named parentheses, or a negative number
                (PCRE2_ERROR_NOSUBSTRING) if not found
 */
-PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
+PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
-pcre2_substring_number_from_name(const pcre2_code *code, 
+pcre2_substring_list_free(PCRE2_SPTR *list)
  PCRE2_SPTR stringname)
 {
-code=code;stringname=stringname;
+pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl));
-return PCRE2_ERROR_NOSUBSTRING;
+memctl->free(memctl, memctl->memory_data);
 }
@ -328,8 +391,10 @@ return PCRE2_ERROR_NOSUBSTRING;
 *     Find (multiple) entries for named string   *
 *************************************************/
-/* This is used by the local get_first_set() function, as well as being
+/* This function scans the nametable for a given name, using binary chop. It 
-generally available. It is used when duplicated names are permitted.
+returns either two pointers to the entries in the table, or, if no pointers are 
 given, the number of a group with the given name. If duplicate names are 
 permitted, this may not be unique.
 Arguments:
  code        the compiled regex
@ -337,17 +402,73 @@ Arguments:
  firstptr    where to put the pointer to the first entry
  lastptr     where to put the pointer to the last entry
-Returns:      the length of each entry, or a negative number
+Returns:      if firstptr and lastptr are NULL, a group number;
              otherwise, the length of each entry, or a negative number
                (PCRE2_ERROR_NOSUBSTRING) if not found
 */
 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname,
-  PCRE2_UCHAR **firstptr, PCRE2_UCHAR **lastptr)
+  PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr)
 {
-code=code;stringname=stringname;firstptr=firstptr;lastptr=lastptr;
+uint16_t bot = 0;
 uint16_t top = code->name_count;
 uint16_t entrysize = code->name_entry_size;
 /* The name table is addressed as the memory that immediately follows the
 pcre2_real_code structure. */
 PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code));
 /* Binary chop: bot is the lowest candidate index, top is one past the
 highest. */
 while (top > bot)
  {
  uint16_t mid = (top + bot) / 2;
  PCRE2_SPTR entry = nametable + entrysize*mid;
  /* Each entry is a group number occupying IMM2_SIZE code units, followed by
  the name. */
  int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE);
  if (c == 0) 
    {
    PCRE2_SPTR first, last, lastentry; 
    /* With no firstptr, just yield the group number of the entry found. */
    /* NOTE(review): only firstptr is tested for NULL; lastptr is written
    unconditionally below - confirm callers pass both or neither. */
    if (firstptr == NULL) return GET2(entry, 0);
    lastentry = nametable + entrysize * (code->name_count - 1);
    first = last = entry;
    /* Widen the range downwards while the preceding entry has the same
    name. */
    while (first > nametable)
      {
      if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break;
      first -= entrysize;
      }
    /* Widen the range upwards while the following entry has the same name. */
    while (last < lastentry)
      {
      if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break;
      last += entrysize;
      }
    *firstptr = first;
    *lastptr = last; 
    return entrysize;
    }
  /* Not found at mid: continue in the upper or the lower half. */
  if (c > 0) bot = mid + 1; else top = mid;
  }
 return PCRE2_ERROR_NOSUBSTRING;
 }
 /*************************************************
 *           Find number for named string         *
 *************************************************/
 /* This function is a convenience wrapper for pcre2_substring_nametable_scan()
 when it is known that names are unique. If there are duplicate names, it is not 
 defined which number is returned.
 Arguments:
  code        the compiled regex
  stringname  the name whose number is required
 Returns:      the number of the named parenthesis, or a negative number
                (PCRE2_ERROR_NOSUBSTRING) if not found
 */
 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_substring_number_from_name(const pcre2_code *code,
  PCRE2_SPTR stringname)
 {
 /* Passing NULL for both pointer arguments makes the scanner yield a group
 number (or a negative error code) instead of a pair of table pointers. */
 int groupnumber = pcre2_substring_nametable_scan(code, stringname, NULL, NULL);
 return groupnumber;
 }
 /* End of pcre2_substring.c */
--- a/src/pcre2_tables.c
+++ b/src/pcre2_tables.c
@ -38,34 +38,33 @@ POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 #ifndef PCRE2_INCLUDED
 /* This module contains some fixed tables that are used by more than one of the
 PCRE code modules. The tables are also #included by the pcre2test program,
 which uses macros to change their names from _pcre2_xxx to xxxx, thereby
-avoiding name clashes with the library. */
+avoiding name clashes with the library. In this case, PCRE2_INCLUDED is 
 defined. */
 #ifndef PCRE2_INCLUDED           /* We're compiling the library */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #include "pcre2_internal.h"
 #endif /* PCRE2_INCLUDED */
 #ifdef FIXME
 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
-the definition is next to the definition of the opcodes in pcre2_internal.h. */
+the definition is next to the definition of the opcodes in pcre2_internal.h. 
 This is mode-dependent, so is skipped when this file is included by pcre2test. */
 #ifndef PCRE2_INCLUDED
 const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
 #endif
 /* Tables of horizontal and vertical whitespace characters, suitable for
 adding to classes. */
 const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
 const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
 #endif /* FIXME */
 /*************************************************
@ -103,8 +102,6 @@ const uint8_t PRIV(utf8_table4)[] = {
 #endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE2_INCLUDED && SUPPORT_PCRE[16|32])*/
 #ifdef FIXME
 #ifdef SUPPORT_UTF
 /* Table to translate from particular type value to the general value. */
@ -122,9 +119,9 @@ const uint32_t PRIV(ucp_gentype)[] = {
 /* This table encodes the rules for finding the end of an extended grapheme
 cluster. Every code point has a grapheme break property which is one of the
-ucp_gbXX values defined in ucp.h. The 2-dimensional table is indexed by the
+ucp_gbXX values defined in pcre2_ucp.h. The 2-dimensional table is indexed by
-properties of two adjacent code points. The left property selects a word from
+the properties of two adjacent code points. The left property selects a word
-the table, and the right property selects a bit from that word like this:
+from the table, and the right property selects a bit from that word like this:
  ucp_gbtable[left-property] & (1 << right-property)
@ -660,6 +657,4 @@ const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
 #endif /* SUPPORT_UTF */
 #endif /* FIXME */
 /* End of pcre2_tables.c */
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
@ -2,28 +2,29 @@
 Do not modify it by hand. Instead modify the script and run it
 to regenerate this code.
-As well as being part of the PCRE library, this module is #included
+As well as being part of the PCRE2 library, this module is #included
-by the pcretest program, which redefines the PRIV macro to change
+by the pcre2test program, which redefines the PRIV macro to change
-table names from _pcre_xxx to xxxx, thereby avoiding name clashes
+table names from _pcre2_xxx to xxxx, thereby avoiding name clashes
 with the library. At present, just one of these tables is actually
 needed. */
-#ifndef PCRE_INCLUDED
+#ifndef PCRE2_INCLUDED
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
-#include "pcre_internal.h"
+#include "pcre2_internal.h"
-#endif /* PCRE_INCLUDED */
+#endif /* PCRE2_INCLUDED */
 /* Unicode character database. */
 /* This file was autogenerated by the MultiStage2.py script. */
 /* Total size: 65688 bytes, block size: 128. */
-/* The tables herein are needed only when UCP support is built
+/* The tables herein are needed only when UCP support is built,
-into PCRE. This module should not be referenced otherwise, so
+and in PCRE2 that happens automatically with UTF support.
 This module should not be referenced otherwise, so
 it should not matter whether it is compiled or not. However
 a comment was received about space saving - maybe the guy linked
 all the modules rather than using a library - so we include a
@ -31,28 +32,28 @@ condition to cut out the tables when not needed. But don't leave
 a totally empty module because some compilers barf at that.
 Instead, just supply small dummy tables. */
-#ifndef SUPPORT_UCP
+#ifndef SUPPORT_UTF
 const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};
-const pcre_uint8 PRIV(ucd_stage1)[] = {0};
+const uint8_t PRIV(ucd_stage1)[] = {0};
-const pcre_uint16 PRIV(ucd_stage2)[] = {0};
+const uint16_t PRIV(ucd_stage2)[] = {0};
-const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
+const uint32_t PRIV(ucd_caseless_sets)[] = {0};
 #else
 /* When recompiling tables with a new Unicode version, please check the
-types in this structure definition from pcre_internal.h (the actual
+types in this structure definition from pcre2_internal.h (the actual
 field names will be different):
 typedef struct {
-pcre_uint8 property_0;
+uint8_t property_0;
-pcre_uint8 property_1;
+uint8_t property_1;
-pcre_uint8 property_2;
+uint8_t property_2;
-pcre_uint8 property_3;
+uint8_t property_3;
 int32_t property_4;
 } ucd_record;
 */
-const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
+const uint32_t PRIV(ucd_caseless_sets)[] = {
  NOTACHAR,
  0x0053,   0x0073,   0x017f,   NOTACHAR,
  0x01c4,   0x01c5,   0x01c6,   NOTACHAR,
@ -75,9 +76,9 @@ const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
  0x00c5,   0x00e5,   0x212b,   NOTACHAR,
 };
-/* When #included in pcretest, we don't need this large table. */
+/* When #included in pcre2test, we don't need this large table. */
-#ifndef PCRE_INCLUDED
+#ifndef PCRE2_INCLUDED
 const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
  {     9,      0,      2,      0,      0, }, /*   0 */
@ -709,7 +710,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
  {    26,     26,     12,      0,      0, }, /* 626 */
 };
-const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */
+const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */
  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, /* U+0000 */
 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, /* U+0800 */
 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 41, 41, 42, 43, 44, 45, /* U+1000 */
@ -1256,7 +1257,7 @@ const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */
 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,202, /* U+10F800 */
 };
-const pcre_uint16 PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
+const uint16_t PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
 /* block 0 */
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  2,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
@ -3290,8 +3291,8 @@ const pcre_uint16 PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
 };
 #if UCD_BLOCK_SIZE != 128
-#error Please correct UCD_BLOCK_SIZE in pcre_internal.h
+#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
 #endif
-#endif  /* SUPPORT_UCP */
+#endif  /* SUPPORT_UTF */
-#endif  /* PCRE_INCLUDED */
+#endif  /* PCRE2_INCLUDED */
--- a/src/pcre2_ucp.h
+++ b/src/pcre2_ucp.h
@ -0,0 +1,237 @@
 /*************************************************
 *      Perl-Compatible Regular Expressions       *
 *************************************************/
 /* PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.
                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2014 University of Cambridge
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 #ifndef _PCRE2_UCP_H
 #define _PCRE2_UCP_H
 /* This file contains definitions of the property values that are returned by
 the UCD access macros. New values that are added for new releases of Unicode
 should always be at the end of each enum, for backwards compatibility.
 IMPORTANT: Note also that the specific numeric values of the enums have to be
 the same as the values that are generated by the maint/MultiStage2.py script,
 where the equivalent property descriptive names are listed in vectors.
 ALSO: The specific values of the first two enums are assumed for the table
 called catposstab in pcre2_compile.c. */
 /* These are the general character categories. No explicit values are given,
 so the enumerators are numbered consecutively from zero in the order listed
 (ucp_C is 0, ucp_Z is 6). That numbering is significant: as noted at the top
 of this file, it has to agree with the values generated by
 maint/MultiStage2.py and with the catposstab table in pcre2_compile.c, so
 entries must not be reordered or inserted in the middle. */
 enum {
  ucp_C,     /* Other */
  ucp_L,     /* Letter */
  ucp_M,     /* Mark */
  ucp_N,     /* Number */
  ucp_P,     /* Punctuation */
  ucp_S,     /* Symbol */
  ucp_Z      /* Separator */
 };
 /* These are the particular character categories. The enumerators take
 consecutive values from zero in the order listed (ucp_Cc is 0, ucp_Zs is 29).
 They are grouped by the first letter of the name, which is the same letter
 used for the names in the general category enum (C, L, M, N, P, S, Z). As
 with the general categories, the numeric values have to agree with those
 generated by maint/MultiStage2.py and with the catposstab table in
 pcre2_compile.c, so the order must be preserved. */
 enum {
  ucp_Cc,    /* Control */
  ucp_Cf,    /* Format */
  ucp_Cn,    /* Unassigned */
  ucp_Co,    /* Private use */
  ucp_Cs,    /* Surrogate */
  ucp_Ll,    /* Lower case letter */
  ucp_Lm,    /* Modifier letter */
  ucp_Lo,    /* Other letter */
  ucp_Lt,    /* Title case letter */
  ucp_Lu,    /* Upper case letter */
  ucp_Mc,    /* Spacing mark */
  ucp_Me,    /* Enclosing mark */
  ucp_Mn,    /* Non-spacing mark */
  ucp_Nd,    /* Decimal number */
  ucp_Nl,    /* Letter number */
  ucp_No,    /* Other number */
  ucp_Pc,    /* Connector punctuation */
  ucp_Pd,    /* Dash punctuation */
  ucp_Pe,    /* Close punctuation */
  ucp_Pf,    /* Final punctuation */
  ucp_Pi,    /* Initial punctuation */
  ucp_Po,    /* Other punctuation */
  ucp_Ps,    /* Open punctuation */
  ucp_Sc,    /* Currency symbol */
  ucp_Sk,    /* Modifier symbol */
  ucp_Sm,    /* Mathematical symbol */
  ucp_So,    /* Other symbol */
  ucp_Zl,    /* Line separator */
  ucp_Zp,    /* Paragraph separator */
  ucp_Zs     /* Space separator */
 };
 /* These are grapheme break properties. Note that the code for processing them
 assumes that the values are less than 16. If more values are added that take
 the number to 16 or more, the code will have to be rewritten. The values are
 implicit (consecutive from zero in the order listed); the number in each
 trailing comment records the resulting value. There are currently 13 values
 (0 to 12). */
 enum {
  ucp_gbCR,                /*  0 */
  ucp_gbLF,                /*  1 */
  ucp_gbControl,           /*  2 */
  ucp_gbExtend,            /*  3 */
  ucp_gbPrepend,           /*  4 */
  ucp_gbSpacingMark,       /*  5 */
  ucp_gbL,                 /*  6 Hangul syllable type L */
  ucp_gbV,                 /*  7 Hangul syllable type V */
  ucp_gbT,                 /*  8 Hangul syllable type T */
  ucp_gbLV,                /*  9 Hangul syllable type LV */
  ucp_gbLVT,               /* 10 Hangul syllable type LVT */
  ucp_gbRegionalIndicator, /* 11 */
  ucp_gbOther              /* 12 */
 };
 /* These are the script identifications. The enumerators take consecutive
 values starting from zero (ucp_Arabic), so a script's position in this list
 is its numeric code. The first group below is in alphabetical order; scripts
 added for later Unicode releases are appended in per-release groups instead of
 being merged into that order, which leaves the existing values unchanged.
 New scripts must likewise be added at the end of the list. */
 enum {
  ucp_Arabic,
  ucp_Armenian,
  ucp_Bengali,
  ucp_Bopomofo,
  ucp_Braille,
  ucp_Buginese,
  ucp_Buhid,
  ucp_Canadian_Aboriginal,
  ucp_Cherokee,
  ucp_Common,
  ucp_Coptic,
  ucp_Cypriot,
  ucp_Cyrillic,
  ucp_Deseret,
  ucp_Devanagari,
  ucp_Ethiopic,
  ucp_Georgian,
  ucp_Glagolitic,
  ucp_Gothic,
  ucp_Greek,
  ucp_Gujarati,
  ucp_Gurmukhi,
  ucp_Han,
  ucp_Hangul,
  ucp_Hanunoo,
  ucp_Hebrew,
  ucp_Hiragana,
  ucp_Inherited,
  ucp_Kannada,
  ucp_Katakana,
  ucp_Kharoshthi,
  ucp_Khmer,
  ucp_Lao,
  ucp_Latin,
  ucp_Limbu,
  ucp_Linear_B,
  ucp_Malayalam,
  ucp_Mongolian,
  ucp_Myanmar,
  ucp_New_Tai_Lue,
  ucp_Ogham,
  ucp_Old_Italic,
  ucp_Old_Persian,
  ucp_Oriya,
  ucp_Osmanya,
  ucp_Runic,
  ucp_Shavian,
  ucp_Sinhala,
  ucp_Syloti_Nagri,
  ucp_Syriac,
  ucp_Tagalog,
  ucp_Tagbanwa,
  ucp_Tai_Le,
  ucp_Tamil,
  ucp_Telugu,
  ucp_Thaana,
  ucp_Thai,
  ucp_Tibetan,
  ucp_Tifinagh,
  ucp_Ugaritic,
  ucp_Yi,
  /* New for Unicode 5.0: */
  ucp_Balinese,
  ucp_Cuneiform,
  ucp_Nko,
  ucp_Phags_Pa,
  ucp_Phoenician,
  /* New for Unicode 5.1: */
  ucp_Carian,
  ucp_Cham,
  ucp_Kayah_Li,
  ucp_Lepcha,
  ucp_Lycian,
  ucp_Lydian,
  ucp_Ol_Chiki,
  ucp_Rejang,
  ucp_Saurashtra,
  ucp_Sundanese,
  ucp_Vai,
  /* New for Unicode 5.2: */
  ucp_Avestan,
  ucp_Bamum,
  ucp_Egyptian_Hieroglyphs,
  ucp_Imperial_Aramaic,
  ucp_Inscriptional_Pahlavi,
  ucp_Inscriptional_Parthian,
  ucp_Javanese,
  ucp_Kaithi,
  ucp_Lisu,
  ucp_Meetei_Mayek,
  ucp_Old_South_Arabian,
  ucp_Old_Turkic,
  ucp_Samaritan,
  ucp_Tai_Tham,
  ucp_Tai_Viet,
  /* New for Unicode 6.0.0: */
  ucp_Batak,
  ucp_Brahmi,
  ucp_Mandaic,
  /* New for Unicode 6.1.0: */
  ucp_Chakma,
  ucp_Meroitic_Cursive,
  ucp_Meroitic_Hieroglyphs,
  ucp_Miao,
  ucp_Sharada,
  ucp_Sora_Sompeng,
  ucp_Takri
 };
 #endif
 /* End of pcre2_ucp.h */
--- a/src/pcre2test.c
+++ b/src/pcre2test.c