Cleanup of Makefile.os4, added release rule and a README file for this release

Implement -Z in pcre2grep and update documentation
Added some special heap tests
2022-07-31 20:34:33 +01:00 · 2022-07-30 17:41:49 +01:00 · 2022-07-28 17:58:19 +01:00 · 2022-07-27 18:00:40 +01:00 · 2022-07-27 17:44:55 +01:00 · 2022-07-15 17:18:11 +01:00
164 changed files with 53173 additions and 18015 deletions
--- a/.bazelrc
+++ b/.bazelrc
@ -0,0 +1,3 @@
+common --experimental_enable_bzlmod
+build --incompatible_enable_cc_toolchain_resolution
+build --incompatible_strict_action_env
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -0,0 +1,77 @@
+
+name: Build
+on: [push, pull_request]
+
+jobs:
+  linux:
+    name: Linux
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+    
+  alpine:
+    name: alpine
+    runs-on: ubuntu-latest
+    container: alpine 
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        
+      - name: Autotools
+        run: apk add --no-cache automake autoconf gcc libtool make musl-dev 
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+        
+  windows:      
+    name: 32bit Windows
+    runs-on: windows-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Configure
+        run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
+
+      - name: Build
+        run: cmake --build build
+
+      - name: Test
+        run: |
+          cd build\Debug
+          ..\..\RunTest.bat
+           
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -0,0 +1,73 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ master ]
+  schedule:
+    - cron: '27 6 * * 4'
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'cpp', 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Learn more about CodeQL language support at https://git.io/codeql-language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v3
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+
+    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v3
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+
+    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+
+    #- run: |
+    #   make bootstrap
+    #   make release
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v3
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@ -0,0 +1,55 @@
+name: Scorecards supply-chain security
+on:
+  # Only the default branch is supported.
+  branch_protection_rule:
+  schedule:
+    - cron: '23 17 * * 1'
+  push:
+    branches: [ master ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecards analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      actions: read
+      contents: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # Read-only PAT token. To create it,
+          # follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
+          repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
+          # Publish the results to enable scorecard badges. For more details, see
+          # https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories, `publish_results` will automatically be set to `false`,
+          # regardless of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional).
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard.
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
+        with:
+          sarif_file: results.sarif
--- a/.gitignore
+++ b/.gitignore
@ -6,7 +6,9 @@
 *.pc
 *.o
 *~
+*.lha

+__pycache__
 .deps
 .libs

@ -74,4 +76,7 @@ src/pcre2.h
 src/pcre2_chartables.c
 src/stamp-h1

+/bazel-*
+
 # End
+
--- a/6
+++ b/6
@ -8,7 +8,7 @@ Email domain:     gmail.com
 Retired from University of Cambridge Computing Service,
 Cambridge, England.

-Copyright (c) 1997-2021 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
 All rights reserved


@ -19,7 +19,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Emain domain:     freemail.hu

-Copyright(c) 2010-2021 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
 All rights reserved.


@ -30,7 +30,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Emain domain:     freemail.hu

-Copyright(c) 2009-2021 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
 All rights reserved.

 ####
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -0,0 +1,72 @@
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+
+copy_file(
+    name = "config_h_generic",
+    src = "src/config.h.generic",
+    out = "src/config.h",
+)
+
+copy_file(
+    name = "pcre2_h_generic",
+    src = "src/pcre2.h.generic",
+    out = "src/pcre2.h",
+)
+
+copy_file(
+    name = "pcre2_chartables_c",
+    src = "src/pcre2_chartables.c.dist",
+    out = "src/pcre2_chartables.c",
+)
+
+cc_library(
+    name = "pcre2",
+    srcs = [
+        "src/pcre2_auto_possess.c",
+        "src/pcre2_compile.c",
+        "src/pcre2_config.c",
+        "src/pcre2_context.c",
+        "src/pcre2_convert.c",
+        "src/pcre2_dfa_match.c",
+        "src/pcre2_error.c",
+        "src/pcre2_extuni.c",
+        "src/pcre2_find_bracket.c",
+        "src/pcre2_maketables.c",
+        "src/pcre2_match.c",
+        "src/pcre2_match_data.c",
+        "src/pcre2_newline.c",
+        "src/pcre2_ord2utf.c",
+        "src/pcre2_pattern_info.c",
+        "src/pcre2_script_run.c",
+        "src/pcre2_serialize.c",
+        "src/pcre2_string_utils.c",
+        "src/pcre2_study.c",
+        "src/pcre2_substitute.c",
+        "src/pcre2_substring.c",
+        "src/pcre2_tables.c",
+        "src/pcre2_ucd.c",
+        "src/pcre2_ucptables.c",
+        "src/pcre2_valid_utf.c",
+        "src/pcre2_xclass.c",
+        ":pcre2_chartables_c",
+    ],
+    hdrs = glob(["src/*.h"]) + [
+        ":config_h_generic",
+        ":pcre2_h_generic",
+    ],
+    defines = [
+        "HAVE_CONFIG_H",
+        "PCRE2_CODE_UNIT_WIDTH=8",
+        "PCRE2_STATIC",
+    ],
+    includes = ["src"],
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+)
+
+cc_binary(
+    name = "pcre2demo",
+    srcs = ["src/pcre2demo.c"],
+    visibility = ["//visibility:public"],
+    deps = [":pcre2"],
+)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -103,13 +103,18 @@
 PROJECT(PCRE2 C)

 # Increased minimum to 2.8.5 to support GNUInstallDirs.
-# Increased minimum to 3.0.0 because older than 2.8.12 is deprecated.
-CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
+# Increased minimum to 3.1 to support imported targets.
+CMAKE_MINIMUM_REQUIRED(VERSION 3.1)

 # Set policy CMP0026 to avoid warnings for the use of LOCATION in
 # GET_TARGET_PROPERTY. This should no longer be required.
 # CMAKE_POLICY(SET CMP0026 OLD)

+# With a recent cmake, you can provide a rootdir to look for non
+# standard installed library dependencies, but to do so, the policy
+# needs to be set to new (by uncommenting the following)
+# CMAKE_POLICY(SET CMP0074 NEW)
+
 # For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
 # on the command line.
 # SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@ -142,10 +147,16 @@ CHECK_INCLUDE_FILE(windows.h    HAVE_WINDOWS_H)
 CHECK_SYMBOL_EXISTS(bcopy         "strings.h"  HAVE_BCOPY)
 CHECK_SYMBOL_EXISTS(memfd_create  "sys/mman.h" HAVE_MEMFD_CREATE)
 CHECK_SYMBOL_EXISTS(memmove       "string.h"   HAVE_MEMMOVE)
-CHECK_SYMBOL_EXISTS(realpath      "stdlib.h"   HAVE_REALPATH)
 CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h"   HAVE_SECURE_GETENV)
 CHECK_SYMBOL_EXISTS(strerror      "string.h"   HAVE_STRERROR)

+CHECK_C_SOURCE_COMPILES(
+  "#include <stdlib.h>
+   #include <limits.h>
+   int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
+  HAVE_REALPATH
+)
+
 set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
 set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
 CHECK_C_SOURCE_COMPILES(
@ -300,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
 IF(EDITLINE_FOUND)
  OPTION (PCRE2_SUPPORT_LIBEDIT  "Enable support for linking pcre2test with libedit." OFF)
 ENDIF(EDITLINE_FOUND)
-IF(PCRE2_SUPPORT_LIBEDIT)
-  INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
-ENDIF(PCRE2_SUPPORT_LIBEDIT)
+IF(EDITLINE_FOUND)
+  IF(PCRE2_SUPPORT_LIBEDIT)
+    INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
+  ENDIF(PCRE2_SUPPORT_LIBEDIT)
+ELSE(EDITLINE_FOUND)
+  IF(PCRE2_SUPPORT_LIBEDIT)
+    MESSAGE(FATAL_ERROR
+      " libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
+      " or set Editline_ROOT to a full libedit installed tree, as needed\n"
+      " Might need to enable policy CMP0074 in CMakeLists.txt"
+    )
+  ENDIF(PCRE2_SUPPORT_LIBEDIT)
+ENDIF(EDITLINE_FOUND)

 # readline lib
 IF(READLINE_FOUND)
@ -340,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
 ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)

 IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
-        MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
+        IF(READLINE_FOUND)
+                MESSAGE(FATAL_ERROR
+                  " Only one of the readline compatible libraries can be enabled.\n"
+                  " Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
+                )
+        ENDIF(READLINE_FOUND)
 ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)

 IF(PCRE2_SUPPORT_BSR_ANYCRLF)
@ -356,7 +382,13 @@ IF(PCRE2_SUPPORT_UNICODE)
 ENDIF(PCRE2_SUPPORT_UNICODE)

 IF(PCRE2_SUPPORT_JIT)
-        SET(SUPPORT_JIT 1)
+	SET(SUPPORT_JIT 1)
+	IF(UNIX)
+		FIND_PACKAGE(Threads REQUIRED)
+		IF(CMAKE_USE_PTHREADS_INIT)
+			SET(REQUIRE_PTHREAD 1)
+		ENDIF(CMAKE_USE_PTHREADS_INIT)
+	ENDIF(UNIX)
 ENDIF(PCRE2_SUPPORT_JIT)

 IF(PCRE2_SUPPORT_JIT_SEALLOC)
@ -626,6 +658,8 @@ IF(MINGW AND BUILD_SHARED_LIBS)
 ENDIF(MINGW AND BUILD_SHARED_LIBS)

 IF(MSVC AND BUILD_SHARED_LIBS)
+  SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
+  SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
    SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
@ -671,6 +705,10 @@ IF(PCRE2_BUILD_PCRE2_8)
      VERSION ${LIBPCRE2_8_VERSION}
      SOVERSION ${LIBPCRE2_8_SOVERSION})
    TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
    SET(targets ${targets} pcre2-8-static)
    ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
    SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
@ -681,6 +719,7 @@ IF(PCRE2_BUILD_PCRE2_8)
      SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
    TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
    TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
    SET(targets ${targets} pcre2-posix-static)

    IF(MSVC)
@ -697,6 +736,7 @@ IF(PCRE2_BUILD_PCRE2_8)

  IF(BUILD_SHARED_LIBS)
    ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
    SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
@ -704,8 +744,12 @@ IF(PCRE2_BUILD_PCRE2_8)
      VERSION ${LIBPCRE2_8_VERSION}
      SOVERSION ${LIBPCRE2_8_SOVERSION}
      OUTPUT_NAME pcre2-8)
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
    SET(targets ${targets} pcre2-8-shared)
    ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
    SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
@ -715,6 +759,8 @@ IF(PCRE2_BUILD_PCRE2_8)
      OUTPUT_NAME pcre2-posix)
    TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
    SET(targets ${targets} pcre2-posix-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})

    IF(MINGW)
      IF(NON_STANDARD_LIB_PREFIX)
@ -740,6 +786,7 @@ ENDIF(PCRE2_BUILD_PCRE2_8)
 IF(PCRE2_BUILD_PCRE2_16)
  IF(BUILD_STATIC_LIBS)
    ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
    SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@ -747,6 +794,9 @@ IF(PCRE2_BUILD_PCRE2_16)
      VERSION ${LIBPCRE2_16_VERSION}
      SOVERSION ${LIBPCRE2_16_SOVERSION})
    TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
    SET(targets ${targets} pcre2-16-static)

    IF(MSVC)
@ -761,6 +811,7 @@ IF(PCRE2_BUILD_PCRE2_16)

  IF(BUILD_SHARED_LIBS)
    ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
    SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@ -768,7 +819,12 @@ IF(PCRE2_BUILD_PCRE2_16)
      VERSION ${LIBPCRE2_16_VERSION}
      SOVERSION ${LIBPCRE2_16_SOVERSION}
      OUTPUT_NAME pcre2-16)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
    SET(targets ${targets} pcre2-16-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})

    IF(MINGW)
      IF(NON_STANDARD_LIB_PREFIX)
@ -792,6 +848,7 @@ ENDIF(PCRE2_BUILD_PCRE2_16)
 IF(PCRE2_BUILD_PCRE2_32)
  IF(BUILD_STATIC_LIBS)
    ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
    SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@ -799,6 +856,9 @@ IF(PCRE2_BUILD_PCRE2_32)
      VERSION ${LIBPCRE2_32_VERSION}
      SOVERSION ${LIBPCRE2_32_SOVERSION})
    TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
    SET(targets ${targets} pcre2-32-static)

    IF(MSVC)
@ -813,6 +873,7 @@ IF(PCRE2_BUILD_PCRE2_32)

  IF(BUILD_SHARED_LIBS)
    ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
    SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@ -820,7 +881,12 @@ IF(PCRE2_BUILD_PCRE2_32)
      VERSION ${LIBPCRE2_32_VERSION}
      SOVERSION ${LIBPCRE2_32_SOVERSION}
      OUTPUT_NAME pcre2-32)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
    SET(targets ${targets} pcre2-32-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})

    IF(MINGW)
      IF(NON_STANDARD_LIB_PREFIX)
@ -1022,25 +1088,13 @@ FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
 FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
 FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)

-FOREACH(man ${man3})
-        GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
-        SET(man3_new ${man3} ${man})
-ENDFOREACH(man ${man3})
-SET(man3 ${man3_new})
-
 INSTALL(FILES ${man1} DESTINATION man/man1)
 INSTALL(FILES ${man3} DESTINATION man/man3)
 INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)

 IF(MSVC AND INSTALL_MSVC_PDB)
-    INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
-                  ${PROJECT_BINARY_DIR}/pcre2posix.pdb
-            DESTINATION bin
-            CONFIGURATIONS RelWithDebInfo)
-    INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
-                  ${PROJECT_BINARY_DIR}/pcre2posixd.pdb
-            DESTINATION bin
-            CONFIGURATIONS Debug)
+ INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
+ INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
 ENDIF(MSVC AND INSTALL_MSVC_PDB)

 # Help, only for nice output
--- a/169
+++ b/169
@ -1,5 +1,160 @@
-Change Log for PCRE2
--------------------
+Change Log for PCRE2 - see also the Git log
+-------------------------------------------
+
+
+Version 10.41 xx-xxx-2022
+-------------------------
+
+1. Add fflush() before and after a fork callout in pcre2grep to get its output
+to be the same on all systems. (There were previously ordering differences in
+Alpine Linux).
+
+2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
+
+3. SSF scorecards grumbled about possible overflow in an expression in
+pcre2test. It never would have overflowed in practice, but some casts have been
+added and at the same time there's been some tidying of fprintf() calls that
+output size_t values.
+
+4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
+
+5. Minor code re-arrangement to remove gcc warning about realloc() in
+pcre2test.
+
+6. Change a number of int variables that hold buffer and line lengths in
+pcre2grep to PCRE2_SIZE (aka size_t).
+
+7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
+supported (even though that function would do nothing in that case) at the
+request of a user who doesn't even want to link with pcre2_jit_compile.o. Also
+tidied up an untidy #ifdef arrangement in pcre2test.
+
+8. Fixed an issue in the backtracking optimization of character repeats in
+JIT. Furthermore optimize star repetitions, not just plus repetitions.
+
+9. Removed the use of an initial backtracking frames vector on the system stack 
+in pcre2_match() so that it now always uses the heap. (In a multi-thread 
+environment with very small stacks there had been an issue.) This also is 
+tidier for JIT matching, which didn't need that vector. The heap vector is now 
+remembered in the match data block and re-used if that block itself is re-used. 
+It is freed with the match data block.
+
+10. Adjusted the find_limits code in pcre2test to work with change 9 above.
+
+11. Added find_limits_noheap to pcre2test, because the heap limits are now 
+different in different environments and so cannot be included in the standard 
+tests.
+
+12. Created a test for pcre2_match() heap processing that is not part of the 
+tests run by 'make check', but can be run manually. The current output is from 
+a 64-bit system.
+
+13. Implemented -Z aka --null in pcre2grep.
+
+
+Version 10.40 15-April-2022
+---------------------------
+
+1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
+handling of multiple passes.
+
+2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
+in pcre2grep with buffered fseek(stdin).
+
+3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
+not supported.
+
+4. Revert an unintended change in JIT repeat detection.
+
+5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
+
+6. Merged documentation and comments patches from @carenas (GitHub #47).
+
+7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
+from pcre2grep.
+
+8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
+
+9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
+substituting.
+
+10. Add null_subject and null_replacement modifiers to pcre2test.
+
+11. Add check for NULL subject to POSIX regexec() function.
+
+12. Add check for NULL replacement to pcre2_substitute().
+
+13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
+pcre2_substitute(), and the replacement argument of the latter, if the pointer
+is NULL and the length is zero, treat as an empty string. Apparently a number
+of applications treat NULL/0 in this way.
+
+14. Added support for Bidi_Class and a number of binary Unicode properties,
+including Bidi_Control.
+
+15. Fix some minor issues raised by clang sanitize.
+
+16. Very minor code speed up for maximizing character property matches.
+
+17. A number of changes to script matching for \p and \P:
+
+    (a) Script extensions for a character are now coded as a bitmap instead of
+        a list of script numbers, which should be faster and does not need a
+        loop.
+
+    (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
+        sc and scx).
+
+    (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
+        the same as \p{scx:scriptname} because this change happened in Perl at
+        release 5.26.
+
+    (d) The standard Unicode 4-letter abbreviations for script names are now
+        recognized.
+
+    (e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
+        hyphens, and underscores are ignored in property names, which are then
+        matched independent of case.
+
+18. The Python scripts in the maint directory have been refactored. There are
+now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
+(which is #included by pcre2_tables.c). The data lists that used to be
+duplicated are now held in a single common Python module.
+
+19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
+hardware capabilities, which consist of both an integer address and additional
+metadata, meaning they are twice the size of the platform's size_t type, i.e.
+16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
+8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
+not 16. Whilst the first frame was always suitably aligned, this then
+misaligned the frame that follows, resulting in an alignment fault when storing
+a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
+Clarke PR#72.
+
+20. Added -LP and -LS listing options to pcre2test.
+
+21. A user discovered that the library names in CMakeLists.txt for MSVC
+debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
+
+22. An item such as [Aa] is optimized into a caseless single character match.
+When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
+pattern, the optimizing "must be present for a match" character check was not
+being flagged as caseless, causing some matches that should have succeeded to
+fail.
+
+23. Fixed a unicode property matching issue in JIT. The character was not
+fully read in caseless matching.
+
+24. Fixed an issue affecting recursions in JIT caused by duplicated data
+transfers.
+
+25. Merged patch from @carenas (GitHub #96) which fixes some problems with
+pcre2test and readline/libedit:
+
+  * Use the right header for libedit in FreeBSD with autoconf
+  * Really allow libedit with cmake
+  * Avoid using readline headers with libedit
+

 Version 10.39 29-October-2021
 -----------------------------
@ -14,10 +169,10 @@ Version 10.39 29-October-2021
  honoured if chosen.

  prtdiff_t is signed, so use a signed type instead, and make sure
-  that an appropiate width is chosen if pointers are 64bit wide and
+  that an appropriate width is chosen if pointers are 64bit wide and
  long is not (ex: Windows 64bit).

-  IMHO removing the cast (and therefore the positibilty of truncation)
+  IMHO removing the cast (and therefore the possibility of truncation)
  make the code cleaner and the fallback is likely portable enough
  with all 64-bit POSIX systems doing LP64 except for Windows.

@ -68,7 +223,7 @@ Version 10.38 01-October-2021
 -----------------------------

 1. Fix invalid single character repetition issues in JIT when the repetition
-is inside a capturing bracket and the bracket is preceeded by character
+is inside a capturing bracket and the bracket is preceded by character
 literals.

 2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
@ -308,7 +463,7 @@ now correctly backtracked, so this unnecessary restriction has been removed.

 7. Added PCRE2_SUBSTITUTE_MATCHED.

-8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another
+8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
 regex engine. The Perl regex folks are aware of this usage and have made a note
 about it.

@ -739,7 +894,7 @@ Patch by Guillem Jover.
 warnings were reported.

 38. Using the clang compiler with sanitizing options causes runtime complaints
-about truncation for statments such as x = ~x when x is an 8-bit value; it
+about truncation for statements such as x = ~x when x is an 8-bit value; it
 seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
 gets rid of the warnings. There were also two missing casts in pcre2test.

--- a/64
+++ b/64
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
 the pcre2test documentation and the comment at the head of the RunTest file.

 PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
-releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
-confusion with PCRE1.
+releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
+releases started at 10.00 to avoid confusion with PCRE1.


 Historical note 1
@ -38,8 +38,8 @@ Historical note 2
 By contrast, the code originally written by Henry Spencer (which was
 subsequently heavily modified for Perl) compiles the expression twice: once in
 a dummy mode in order to find out how much store will be needed, and then for
-real. (The Perl version probably doesn't do this any more; I'm talking about
-the original library.) The execution function operates by backtracking and
+real. (The Perl version may or may not still do this; I'm talking about the
+original library.) The execution function operates by backtracking and
 maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
 matches individual wild portions of the pattern. This is an "NFA algorithm" in
 Friedl's terminology.
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
 advance to check for such values. When auto-callouts are enabled, the generous
 assumption is made that there will be a callout for each pattern code unit
 (which of course is only actually true if all code units are literals) plus one
-at the end. There is a default parsed pattern vector on the system stack, but
-if this is not big enough, heap memory is used.
+at the end. A default parsed pattern vector is defined on the system stack, to
+minimize memory handling, but if this is not big enough, heap memory is used.

 As before, the actual compiling function is run twice, the first time to
 determine the amount of memory needed for the final compiled pattern. It
@ -187,7 +187,7 @@ META_CLASS_EMPTY      [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
 META_CLASS_EMPTY_NOT  [^] negative empty class - ditto
 META_CLASS_END        ] end of non-empty class
 META_CLASS_NOT        [^ start non-empty negative class
-META_COMMIT           (*COMMIT)
+META_COMMIT           (*COMMIT) - no argument (see below for with argument)
 META_COND_ASSERT      (?(?assertion)
 META_DOLLAR           $ metacharacter
 META_DOT              . metacharacter
@ -201,18 +201,18 @@ META_NOCAPTURE        (?: no capture parens
 META_PLUS             +
 META_PLUS_PLUS        ++
 META_PLUS_QUERY       +?
-META_PRUNE            (*PRUNE) - no argument
+META_PRUNE            (*PRUNE) - no argument (see below for with argument)
 META_QUERY            ?
 META_QUERY_PLUS       ?+
 META_QUERY_QUERY      ??
 META_RANGE_ESCAPED    hyphen in class range with at least one escape
 META_RANGE_LITERAL    hyphen in class range defined literally
-META_SKIP             (*SKIP) - no argument
-META_THEN             (*THEN) - no argument
+META_SKIP             (*SKIP) - no argument (see below for with argument)
+META_THEN             (*THEN) - no argument (see below for with argument)

 The two RANGE values occur only in character classes. They are positioned
 between two literals that define the start and end of the range. In an EBCDIC
-evironment it is necessary to know whether either of the range values was
+environment it is necessary to know whether either of the range values was
 specified as an escape. In an ASCII/Unicode environment the distinction is not
 relevant.

@ -229,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
 is the length of its branch, for which OP_REVERSE must be generated.

 META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
-their data in the lower 16 bits of the element.
+their data in the lower 16 bits of the element. META_RECURSE is followed by an
+offset, for use in error messages.

 META_BACKREF is followed by an offset if the back reference group number is 10
-or more. The offsets of the first ocurrences of references to groups whose
+or more. The offsets of the first occurrences of references to groups whose
 numbers are less than 10 are put in cb->small_ref_offset[] (only the first
 occurrence is useful). On 64-bit systems this avoids using more than two parsed
 pattern elements for items such as \3. The offset is used when an error occurs
 because the reference is to a non-existent group.

-META_RECURSE is always followed by an offset, for use in error messages.
-
 META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
 element contains the 16-bit type and data property values, packed together.
 ESC_g and ESC_k are used only for named references - numerical ones are turned
@ -291,9 +290,9 @@ META_LOOKBEHIND       (?<=      start of lookbehind
 META_LOOKBEHIND_NA    (*naplb:  start of non-atomic lookbehind
 META_LOOKBEHINDNOT    (?<!      start of negative lookbehind

-The following are followed by two elements, the minimum and maximum. Repeat
-values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
-represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
+The following are followed by two elements, the minimum and maximum. The
+maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
+is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:

 META_MINMAX           {n,m}  repeat
 META_MINMAX_PLUS      {n,m}+ repeat
@ -347,11 +346,11 @@ support is not available for this kind of matching.
 Changeable options
 ------------------

-The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
-others) may be changed in the middle of patterns by items such as (?i). Their
-processing is handled entirely at compile time by generating different opcodes
-for the different settings. The runtime functions do not need to keep track of
-an option's state.
+The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
+some others may be changed in the middle of patterns by items such as (?i).
+Their processing is handled entirely at compile time by generating different
+opcodes for the different settings. The runtime functions do not need to keep
+track of an option's state.

 PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
 are tracked and processed during the parsing pre-pass. The others are handled
@ -437,7 +436,7 @@ Backtracking control verbs
 --------------------------

 Verbs with no arguments generate opcodes with no following data (as listed
-in the section above). 
+in the section above).

 (*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
 length in one code unit, and followed by a binary zero. The name length is
@ -468,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
 case-equivalent code points (which is possible only in UTF mode) is handled by
 compiling a Unicode property item (see below), with the pseudo-property
 PT_CLIST. The value of this property is an offset in a vector called
-"ucd_caseless_sets" which identifies the start of a short list of equivalent
-characters, terminated by the value NOTACHAR (0xffffffff).
+"ucd_caseless_sets" which identifies the start of a short list of
+case-equivalent characters, terminated by the value NOTACHAR (0xffffffff).


 Repeating single characters
@ -546,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
 and a value. The types are a set of #defines of the form PT_xxx, and the values
 are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
 The value is relevant only for PT_GC (General Category), PT_PC (Particular
-Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
-identify a list of case-equivalent characters when there are three or more.
+Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
+and the pseudo-property PT_CLIST, which is used to identify a list of
+case-equivalent characters when there are three or more (see above).

 Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
 three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
@ -665,9 +665,9 @@ a count that immediately follows the offset.
 There are several opcodes that mark the end of a subpattern group. OP_KET is
 used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
 OP_KETRMAX are used for indefinite repetitions, minimally or maximally
-respectively, and OP_KETRPOS for possessive repetitions (see below for more 
+respectively, and OP_KETRPOS for possessive repetitions (see below for more
 details). All four are followed by a LINK_SIZE value giving (as a positive
-number) the offset back to the matching bracket opcode.
+number) the offset back to the matching opening bracket opcode.

 If a subpattern is quantified such that it is permitted to match zero times, it
 is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
@ -718,7 +718,7 @@ Assertions

 Forward assertions are also just like other subpatterns, but starting with one
 of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
-OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK, 
+OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
 OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
 assertion is OP_REVERSE, followed by a count of the number of characters to
 move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
@ -827,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
 opcode are the correct length, in order to catch updating errors.

 Philip Hazel
-12 July 2019
+April 2022
--- a/6
+++ b/6
@ -26,7 +26,7 @@ Email domain:     gmail.com
 Retired from University of Cambridge Computing Service,
 Cambridge, England.

-Copyright (c) 1997-2021 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
 All rights reserved.


@ -37,7 +37,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Email domain:     freemail.hu

-Copyright(c) 2010-2021 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
 All rights reserved.


@ -48,7 +48,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Email domain:     freemail.hu

-Copyright(c) 2009-2021 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
 All rights reserved.


--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -0,0 +1,8 @@
+module(
+    name = "pcre2",
+    version = "10.40",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "rules_cc", version = "0.0.1")
+bazel_dep(name = "bazel_skylib", version = "1.2.1")
--- a/Makefile.am
+++ b/Makefile.am
@ -382,6 +382,10 @@ COMMON_SOURCES = \
  src/pcre2_valid_utf.c \
  src/pcre2_xclass.c

+# The pcre2_ucptables.c file is #included by pcre2_tables.c
+
+EXTRA_DIST += src/pcre2_ucptables.c
+
 if WITH_PCRE2_8
 lib_LTLIBRARIES += libpcre2-8.la
 libpcre2_8_la_SOURCES = \
@ -448,9 +452,10 @@ EXTRA_DIST += \
  src/sljit/sljitNativePPC_32.c \
  src/sljit/sljitNativePPC_64.c \
  src/sljit/sljitNativePPC_common.c \
+  src/sljit/sljitNativeRISCV_32.c \
+  src/sljit/sljitNativeRISCV_64.c \
+  src/sljit/sljitNativeRISCV_common.c \
  src/sljit/sljitNativeS390X.c \
-  src/sljit/sljitNativeSPARC_32.c \
-  src/sljit/sljitNativeSPARC_common.c \
  src/sljit/sljitNativeX86_32.c \
  src/sljit/sljitNativeX86_64.c \
  src/sljit/sljitNativeX86_common.c \
@ -663,6 +668,7 @@ EXTRA_DIST += \
  testdata/testinput23 \
  testdata/testinput24 \
  testdata/testinput25 \
+  testdata/testinput26 \
  testdata/testinputEBC \
  testdata/testoutput1 \
  testdata/testoutput2 \
@ -705,6 +711,7 @@ EXTRA_DIST += \
  testdata/testoutput23 \
  testdata/testoutput24 \
  testdata/testoutput25 \
+  testdata/testoutput26 \
  testdata/testoutputEBC \
  testdata/valgrind-jit.supp \
  testdata/wintestinput3 \
--- a/Makefile.os4
+++ b/Makefile.os4
@ -0,0 +1,271 @@
+#
+# Project: pcre2
+#
+# Created on: 10-01-2022 22:01:46
+#
+# commands to use:
+# make -f Makefile.os4 libpcre2.a
+# make -f Makefile.os4 libpcre2-posix.a
+# make -f Makefile.os4 pcre2test
+# sh RunTest
+# make -f Makefile.os4 clean
+#
+
+###################################################################
+##
+##////  Objects
+##
+###################################################################
+
+libpcre2_OBJ := \
+	 src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
+	 src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
+	 src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
+	 src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
+	 src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
+	 src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
+	 src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
+	 src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
+	 src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
+	
+
+
+pcre2posix_OBJ := \
+	 src/pcre2posix.o
+
+
+pcre2test_OBJ := \
+	 src/pcre2test.o
+
+
+pcre2grep_OBJ := \
+	 src/pcre2grep.o
+
+###################################################################
+##
+##////  Variables and Environment
+##
+###################################################################
+
+# C runtime selection: clib2 when USE_CLIB2=yes is given, newlib otherwise.
+ifeq ($(USE_CLIB2), yes)
+MCRT := -mcrt=clib2
+else
+MCRT := -mcrt=newlib
+endif
+
+CC := gcc:bin/gcc
+
+INCPATH := -I. -Isrc
+
+# Compiler flags used by the .c.o rule for every object (the libraries,
+# pcre2test and pcre2grep). The code unit width is fixed at 8 bits.
+CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
+
+###################################################################
+##
+##////  General rules
+##
+###################################################################
+
+# Targets that never correspond to a file. "release" must be listed here
+# because its recipe creates (and finally removes) a directory named
+# "release"; if that directory is left behind by a failed run, a non-phony
+# target would be considered up to date and the recipe would be skipped.
+.PHONY: all all-before all-after tests clean clean-custom cleanall realclean \
+	runtests release
+
+all: all-before libpcre2.a libpcre2-posix.a all-after
+
+all-before:
+#	You can add rules here to execute before the project is built
+
+all-after:
+#	You can add rules here to execute after the project is built
+
+tests: pcre2test pcre2grep
+
+# Remove every object file produced by this makefile, including the
+# pcre2grep object, so that a later build (for example with a different C
+# runtime) recompiles everything.
+clean: clean-custom
+	@echo "Cleaning compiler objects..."
+	@rm -f  $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ) $(pcre2grep_OBJ)
+
+# Remove the objects (via "clean") and also the libraries and programs that
+# this makefile builds. The archive names must match the libpcre2.a and
+# libpcre2-posix.a targets defined below.
+cleanall: clean
+	@echo "Cleaning compiler targets..."
+	@rm -f  libpcre2.a libpcre2-posix.a pcre2test pcre2grep
+
+###################################################################
+##
+##////  Targets
+##
+###################################################################
+
+# Static archive holding all core library objects.
+libpcre2.a: $(libpcre2_OBJ)
+	ar -rcs $@ $(libpcre2_OBJ)
+	ranlib $@
+
+# Static archive holding the POSIX wrapper object.
+libpcre2-posix.a: $(pcre2posix_OBJ)
+	ar -rcs $@ $(pcre2posix_OBJ)
+	ranlib $@
+
+# Link the pcre2test program. $(CC) is used so that overriding CC affects
+# linking as well as compilation. libpcre2-posix is listed before libpcre2
+# because static archives are searched left to right and the POSIX wrapper
+# needs symbols from the core library.
+pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
+	@echo "Linking pcre2test"
+	@$(CC) $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2-posix -lpcre2
+	@echo "Removing stale debug target: pcre2test"
+	@rm -f pcre2test.debug
+	
+# Link the pcre2grep program. $(CC) is used so that overriding CC affects
+# linking as well as compilation.
+pcre2grep: libpcre2.a $(pcre2grep_OBJ)
+	@echo "Linking pcre2grep"
+	@$(CC) $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
+	@echo "Removing stale debug target: pcre2grep"
+	@rm -f pcre2grep.debug
+
+
+###################################################################
+##
+##////  Standard rules
+##
+###################################################################
+
+# Generic suffix rule that compiles every object named in the dependency
+# list below. The compiler command itself is silenced; only a short
+# progress message is printed.
+
+.c.o:
+	@echo "Compiling $<"
+	@$(CC) $(CFLAGS) -c $< -o $@
+
+src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	
+
+src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	 src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
+	 src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
+	 src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
+	 src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
+
+src/pcre2_maketables.o: src/pcre2_maketables.c
+
+src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
+	 src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
+	 src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
+	 src/pcre2_ucd.c src/pcre2_printint.c
+
+src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
+	
+
+src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	
+
+src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+
+src/pcre2grep.o: src/pcre2grep.c src/config.h
+
+###################################################################
+##
+##////  Custom rules
+##
+###################################################################
+
+runtests: libpcre2.a libpcre2-posix.a tests
+	sh RunTest
+	sh RunGrepTest
+
+# Build the distributable pcre2.lha archive. The libraries are built twice,
+# once per C runtime (newlib, then clib2), and copied into a temporary
+# "release" tree together with the public headers and documentation. The tree
+# is packed with lha and then removed.
+#
+# $(MAKE) is used for the recursive invocations so that the same make program
+# and its flags are propagated. The tree is cleaned before the first build so
+# that objects left over from an earlier build cannot end up in the archives,
+# and USE_CLIB2=no is passed explicitly so that the newlib build is not
+# affected by a USE_CLIB2 setting given on the command line.
+release:
+	@echo "Create release folders..."
+	@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
+
+	@echo "Clean build and libraries files..."
+	@$(MAKE) -f Makefile.os4 cleanall
+
+	@echo "Building newlib based libraries..."
+	@$(MAKE) -f Makefile.os4 all USE_CLIB2=no
+	@cp libpcre2.a release/local/newlib/lib/
+	@cp libpcre2-posix.a release/local/newlib/lib/
+
+	@echo "Clean build and libraries files..."
+	@$(MAKE) -f Makefile.os4 cleanall
+
+	@echo "Building clib2 based libraries..."
+	@$(MAKE) -f Makefile.os4 all USE_CLIB2=yes
+	@cp libpcre2.a release/local/clib2/lib/
+	@cp libpcre2-posix.a release/local/clib2/lib/
+
+	@echo "Copy the necessary files..."
+	@cp src/pcre2.h release/local/common/include/
+	@cp src/pcre2posix.h release/local/common/include/
+	@cp COPYING release/local/Documentation/pcre2/
+	@cp HACKING release/local/Documentation/pcre2/
+	@cp LICENCE release/local/Documentation/pcre2/
+	@cp README release/local/Documentation/pcre2/
+	@cp README-OS4.md release/local/Documentation/pcre2/
+
+	@echo "Clean build and libraries files..."
+	@$(MAKE) -f Makefile.os4 cleanall
+
+	@echo "Creating the lha release file..."
+	@rm -f pcre2.lha
+	@lha -aeqr3 a pcre2.lha release/
+
+	@rm -rf release
+
+###################################################################
+
--- a/32
+++ b/32
@ -2,6 +2,38 @@ News about PCRE2 releases
 -------------------------


+Version 10.40 15-April-2022
+---------------------------
+
+This is mostly a bug-fixing and code-tidying release. However, there are some
+extensions to Unicode property handling:
+
+* Added support for Bidi_Class and a number of binary Unicode properties,
+including Bidi_Control.
+
+* A number of changes to script matching for \p and \P:
+
+  (a) Script extensions for a character are now coded as a bitmap instead of
+      a list of script numbers, which should be faster and does not need a
+      loop.
+
+  (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
+      sc and scx).
+
+  (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
+      the same as \p{scx:scriptname} because this change happened in Perl at
+      release 5.26.
+
+  (d) The standard Unicode 4-letter abbreviations for script names are now
+      recognized.
+
+  (e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
+      hyphens, and underscores are ignored in property names, which are then
+      matched independent of case.
+
+As always, see ChangeLog for a list of all changes (also the Git log).
+
+
 Version 10.39 29-October-2021
 -----------------------------

--- a/5
+++ b/5
@ -121,6 +121,7 @@ environment, for example.
       pcre2_substring.c
       pcre2_tables.c
       pcre2_ucd.c
+       pcre2_ucptables.c
       pcre2_valid_utf.c
       pcre2_xclass.c

@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
 3.  Create a new, empty build directory, preferably a subdirectory of the
    source dir. For example, C:\pcre2\pcre2-xx\build.

-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
    Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
    to start Cmake from the Windows Start menu, as this can lead to errors.

@ -373,7 +374,7 @@ Otherwise:
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
   have been created.

-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
   the pcre2 source (wherein which the testdata folder resides), e.g.:

   set srcdir=C:\pcre2\pcre2-10.00
--- a/49
+++ b/49
@ -8,7 +8,7 @@ features, and the internals have been improved. The original PCRE1 library is
 now obsolete and no longer maintained. The latest release of PCRE2 is available
 in .tar.gz, tar.bz2, or .zip form from this GitHub repository:

-https://github.com/PhilipHazel/pcre2/releases
+https://github.com/PCRE2Project/pcre2/releases

 There is a mailing list for discussion about the development of PCRE2 at
 pcre2-dev@googlegroups.com. You can subscribe by sending an email to
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
 You can access the archives and also subscribe or manage your subscription
 here:

-https://groups.google.com/pcre2-dev
+https://groups.google.com/g/pcre2-dev

 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@ -114,12 +114,18 @@ Building PCRE2 using autotools
 The following instructions assume the use of the widely used "configure; make;
 make install" (autotools) process.

-To build PCRE2 on system that supports autotools, first run the "configure"
-command from the PCRE2 distribution directory, with your current directory set
+If you have downloaded and unpacked a PCRE2 release tarball, run the
+"configure" command from the PCRE2 directory, with your current directory set
 to the directory where you want the files to be created. This command is a
 standard GNU "autoconf" configuration script, for which generic instructions
 are supplied in the file INSTALL.

+The files in the GitHub repository do not contain "configure". If you have
+downloaded the PCRE2 source files from GitHub, before you can run "configure"
+you must run the shell script called autogen.sh. This runs a number of
+autotools to create a "configure" script (you must of course have the autotools
+commands installed in order to do this).
+
 Most commonly, people build PCRE2 within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient. However,
 the usual methods of changing standard defaults are available. For example:
@ -188,10 +194,10 @@ library. They are also documented in the pcre2build man page.

  As well as supporting UTF strings, Unicode support includes support for the
  \P, \p, and \X sequences that recognize Unicode character properties.
-  However, only the basic two-letter properties such as Lu are supported.
-  Escape sequences such as \d and \w in patterns do not by default make use of
-  Unicode properties, but can be made to do so by setting the PCRE2_UCP option
-  or starting a pattern with (*UCP).
+  However, only a subset of Unicode properties are supported; see the
+  pcre2pattern man page for details. Escape sequences such as \d and \w in
+  patterns do not by default make use of Unicode properties, but can be made to
+  do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).

 . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
  of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
@ -369,7 +375,8 @@ library. They are also documented in the pcre2build man page.
  necessary to specify something like LIBS="-lncurses" as well. This is
  because, to quote the readline INSTALL, "Readline uses the termcap functions,
  but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
+  applications which link with readline the option to choose an appropriate
+  library."
  If you get error messages about missing functions tgetstr, tgetent, tputs,
  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
  should fix it.
@ -394,10 +401,10 @@ library. They are also documented in the pcre2build man page.
  Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
  be created. This is normally run under valgrind or used when PCRE2 is
  compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.

 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
  which caused pcre2_match() to use individual blocks on the heap for
@ -411,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
 . Makefile             the makefile that builds the library
 . src/config.h         build-time configuration options for the library
 . src/pcre2.h          the public PCRE2 header file
-. pcre2-config          script that shows the building settings such as CFLAGS
+. pcre2-config         script that shows the building settings such as CFLAGS
                         that were set for "configure"
 . libpcre2-8.pc        )
 . libpcre2-16.pc       ) data for the pkg-config command
@ -571,9 +578,9 @@ at build time" for more details.
 Making new tarballs
 -------------------

-The command "make dist" creates two PCRE2 tarballs, in tar.gz and zip formats.
-The command "make distcheck" does the same, but then does a trial build of the
-new distribution to ensure that it works.
+The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
+zip formats. The command "make distcheck" does the same, but then does a trial
+build of the new distribution to ensure that it works.

 If you have modified any of the man page sources in the doc directory, you
 should first run the PrepareRelease script before making a distribution. This
@ -602,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.

 Many (but not all) of the tests that are not skipped are run twice if JIT
 support is available. On the second run, JIT compilation is forced. This
-testing can be suppressed by putting "nojit" on the RunTest command line.
+testing can be suppressed by putting "-nojit" on the RunTest command line.

 The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
 libraries that are enabled. If you want to run just one set of tests, call
 RunTest with either the -8, -16 or -32 option.

-If valgrind is installed, you can run the tests under it by putting "valgrind"
+If valgrind is installed, you can run the tests under it by putting "-valgrind"
 on the RunTest command line. To run pcre2test on just one or more specific test
 files, give their numbers as arguments to RunTest, for example:

@ -689,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
 different code unit widths.

 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.

 Test 16 is run only when JIT support is not available. It checks that an
@ -905,4 +912,4 @@ The distribution should contain the files listed below.
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 29 October 2021
+Last updated: 15 April 2022
--- a/README-OS4.md
+++ b/README-OS4.md
@ -0,0 +1,39 @@
+PCRE2 (Perl-compatible regular expression library)
+---------------------------------------------------------------------------
+
+This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
+GitHub repository https://github.com/PCRE2Project/pcre2
+
+More information about PCRE2 can be found on its official website
+at https://www.pcre.org and in the documentation that comes with this
+package.
+
+The archive includes both newlib and clib2 libraries. They have been
+tested with various applications, but if you find any issues, please
+contact me.
+
+To install it into your AmigaOS 4 SDK installation, just extract all the
+files into the SDK: path.
+
+Compile
+--------------------------
+The source and the changes I made can be found in my personal repository
+https://git.walkero.gr/walkero/pcre2
+
+You can compile it using the Makefile.os4 file, and produce the libraries
+yourself.
+
+* with newlib run:
+  ```bash
+  make -f Makefile.os4 all
+  ```
+* with clib2 run:
+  ```bash
+  make -f Makefile.os4 all USE_CLIB2=yes
+  ```
+
+Changelog
+--------------------------
+v10.40r1 - 2022-07-31
+* First release
+
--- a/README.md
+++ b/README.md
@ -14,14 +14,14 @@ flexible API, the code of PCRE2 has been much improved since the fork.
 ## Download

 As well as downloading from the 
-[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
 or the older, unmaintained PCRE1 library from an 
 [*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.

 You can check out the PCRE2 source code via Git or Subversion:

-    git clone https://github.com/PhilipHazel/pcre2.git
-    svn co    https://github.com/PhilipHazel/pcre2.git
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git

 ## Contributed Ports

@ -36,7 +36,7 @@ default character encoding, can be found at
 ## Documentation

 You can read the PCRE2 documentation 
-[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).

 Comparisons to Perl's regular expression semantics can be found in the
 community authored Wikipedia entry for PCRE.
--- a/48
+++ b/48
@ -68,6 +68,22 @@ diff -b  /dev/null /dev/null 2>/dev/null && cf="diff -b"
 diff -u  /dev/null /dev/null 2>/dev/null && cf="diff -u"
 diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"

+# Some tests involve NUL characters. It seems impossible to handle them easily
+# in many operating systems. An earlier version of this script used sed to
+# translate NUL into the string ZERO, but this didn't work on Solaris (aka
+# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
+# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
+# even when using GNU sed. A user suggested using tr instead, which
+# necessitates translating to a single character. However, on (some versions
+# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
+# /usr/xpg4/bin/tr is available, it can do so, so test for that.
+
+if [ -x /usr/xpg4/bin/tr ] ; then
+  tr=/usr/xpg4/bin/tr
+else
+  tr=tr
+fi
+
 # If this test is being run from "make check", $srcdir will be set. If not, set
 # it to the current or parent directory, whichever one contains the test data.
 # Subsequently, we run most of the pcre2grep tests in the source directory so
@ -674,13 +690,27 @@ echo "---------------------------- Test 131 -----------------------------" >>tes
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <$srcdir/testdata/grepinput >>testtrygrep 2>&1
+(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
+(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
 (cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep

+echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
 # Now compare the results.

 $cf $srcdir/testdata/grepoutput testtrygrep
@ -755,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
 printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
 $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep

-# This next test involves NUL characters. It seems impossible to handle them
-# easily in many operating systems. An earlier version of this script used sed
-# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
-# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
-# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
-# even when using GNU sed. A user suggested using tr instead, which
-# necessitates translating to a single character (@). However, on (some
-# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
-# /usr/xpg4/bin/tr is available, it can do so, so test for that.
-
-if [ -x /usr/xpg4/bin/tr ] ; then
-  tr=/usr/xpg4/bin/tr
-else
-  tr=tr
-fi
-
 printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
 printf 'abc\0def' >testNinputgrep
 $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
--- a/63
+++ b/63
@ -17,8 +17,16 @@
 # individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
 # end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
 # runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
-# except test 10. Whatever order the arguments are in, the tests are always run
-# in numerical order.
+# except test 10. Whatever order the arguments are in, these tests are always
+# run in numerical order.
+#
+# If no specific tests are selected (which is the case when this script is run
+# via 'make check') the default is to run all the numbered tests.
+#
+# There may also be named (as well as numbered) tests for special purposes. At
+# present there is just one, called "heap". This test's output contains the
+# sizes of heap frames and frame vectors, which depend on the environment. It
+# is therefore not run unless explicitly requested.
 #
 # Inappropriate tests are automatically skipped (with a comment to say so). For
 # example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
 title23="Test 23: \C disabled test"
 title24="Test 24: Non-UTF pattern conversion tests"
 title25="Test 25: UTF pattern conversion tests"
-maxtest=25
+title26="Test 26: Auto-generated unicode property tests"
+maxtest=26
+titleheap="Test 'heap': Environment-specific heap tests"

 if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title0
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title23
  echo $title24
  echo $title25
+  echo $title26
+  echo ""
+  echo $titleheap
+  echo ""
+  echo "Numbered tests are automatically run if nothing selected."
+  echo "Named tests must be explicitly selected."
  exit 0
 fi

@ -238,6 +254,8 @@ do22=no
 do23=no
 do24=no
 do25=no
+do26=no
+doheap=no

 while [ $# -gt 0 ] ; do
  case $1 in
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
   23) do23=yes;;
   24) do24=yes;;
   25) do25=yes;;
+   26) do26=yes;;
+ heap) doheap=yes;;
   -8) arg8=yes;;
  -16) arg16=yes;;
  -32) arg32=yes;;
@ -320,7 +340,8 @@ fi
 # set up a large stack.

 $sim ./pcre2test -S 64 /dev/null /dev/null
-if [ $? -eq 0 -a "$bigstack" != "" ] ; then
+support_setstack=$?
+if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
  setstack="-S 64"
 else
  setstack=""
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
  fi
 fi

-# If no specific tests were requested, select all. Those that are not
-# relevant will be automatically skipped.
+# If no specific tests were requested, select all the numbered tests. Those
+# that are not relevant will be automatically skipped.

 if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do4  = no -a $do5  = no -a $do6  = no -a $do7  = no -a \
@ -416,7 +437,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
     $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
     $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
-     $do24 = no -a $do25 = no \
+     $do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
   ]; then
  do0=yes
  do1=yes
@ -444,6 +465,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
  do23=yes
  do24=yes
  do25=yes
+  do26=yes
 fi

 # Handle any explicit skips at this stage, so that an argument list may consist
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
    echo '' >testtry
    checkspecial '-C'
    checkspecial '--help'
-    checkspecial '-S 1 -t 10 testSinput'
+    if [ $support_setstack -eq 0 ] ; then
+      checkspecial '-S 1 -t 10 testSinput'
+    fi
    echo "  OK"
  fi

@ -860,6 +884,29 @@ for bmode in "$test8" "$test16" "$test32"; do
    fi
  fi

+  # Auto-generated unicode property tests
+
+  if [ $do26 = yes ] ; then
+    echo $title26
+    if [ $utf -eq 0 ] ; then
+      echo "  Skipped because UTF-$bits support is not available"
+    else
+      for opt in "" $jitopt; do
+        $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
+        checkresult $? 26 "$opt"
+      done
+    fi
+  fi
+
+  # Manually selected heap tests - output may vary in different environments,
+  # which is why they are not automatically run.
+
+  if [ $doheap = yes ] ; then
+    echo $titleheap
+    $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
+    checkresult $? heap-$bits ""
+  fi
+
 # End of loop for 8/16/32-bit tests
 done

--- a/RunTest.bat
+++ b/RunTest.bat
@ -135,9 +135,9 @@ if "%all%" == "yes" (
  set do7=yes
  set do8=yes
  set do9=yes
-  set do10=yes
+  set do10=no
  set do11=yes
-  set do12=yes
+  set do12=no
  set do13=yes
  set do14=yes
  set do15=yes
--- a/WORKSPACE.bazel
+++ b/WORKSPACE.bazel
@ -0,0 +1 @@
+# See MODULE.bazel
--- a/cmake/FindEditline.cmake
+++ b/cmake/FindEditline.cmake
@ -1,17 +1,16 @@
 # Modified from FindReadline.cmake (PH Feb 2012)

-if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
+if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
  set(EDITLINE_FOUND TRUE)
-else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
-  FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
-    /usr/include/editline
-    /usr/include/edit/readline  
-    /usr/include/readline
+else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
+  FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
+    editline
+    edit/readline
  )
  
  FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
  include(FindPackageHandleStandardArgs)
-  FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
+  FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)

  MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
-endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
+endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
--- a/configure.ac
+++ b/configure.ac
@ -9,15 +9,15 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
 dnl be defined as -RC2, for example. For real releases, it should be empty.

 m4_define(pcre2_major, [10])
-m4_define(pcre2_minor, [39])
+m4_define(pcre2_minor, [41])
 m4_define(pcre2_prerelease, [])
-m4_define(pcre2_date, [2021-10-29])
+m4_define(pcre2_date, [2022-xx-xx])

 # Libtool shared library interface versions (current:revision:age)
-m4_define(libpcre2_8_version,     [10:4:10])
-m4_define(libpcre2_16_version,    [10:4:10])
-m4_define(libpcre2_32_version,    [10:4:10])
-m4_define(libpcre2_posix_version, [3:1:0])
+m4_define(libpcre2_8_version,     [11:0:11])
+m4_define(libpcre2_16_version,    [11:0:11])
+m4_define(libpcre2_32_version,    [11:0:11])
+m4_define(libpcre2_posix_version, [3:2:0])

 # NOTE: The CMakeLists.txt file searches for the above variables in the first
 # 50 lines of this file. Please update that if the variables above are moved.
@ -512,7 +512,20 @@ AC_TYPE_SIZE_T

 # Checks for library functions.

-AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp realpath secure_getenv strerror)
+AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
+AC_MSG_CHECKING([for realpath])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+#include <stdlib.h>
+#include <limits.h>
+]],[[
+char buffer[PATH_MAX];
+realpath(".", buffer);
+]])],
+[AC_MSG_RESULT([yes])
+ AC_DEFINE([HAVE_REALPATH], 1,
+  [Define to 1 if you have the `realpath' function.])
+],
+AC_MSG_RESULT([no]))

 # Check for the availability of libz (aka zlib)

@ -584,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
 fi
 fi

-
 # Check for the availability of libedit. Different distributions put its
 # headers in different places. Try to cover the most common ones.

 if test "$enable_pcre2test_libedit" = "yes"; then
-  AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
-    [AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
-      [AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
+  AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
+    HAVE_LIBEDIT_HEADER=1
+    break
+  ])
  AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
 fi

@ -927,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
    echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
    exit 1
  fi
-  if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
-          "$HAVE_READLINE_READLINE_H" != "1"; then
-    echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
-    echo "** nor readline/readline.h was found."
+  if test -z "$HAVE_LIBEDIT_HEADER"; then
+    echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
+    echo "** edit/readline/readline.h nor a compatible header was found."
    exit 1
  fi
  if test -z "$LIBEDIT"; then
--- a/doc/html/NON-AUTOTOOLS-BUILD.txt
+++ b/doc/html/NON-AUTOTOOLS-BUILD.txt
@ -121,6 +121,7 @@ environment, for example.
       pcre2_substring.c
       pcre2_tables.c
       pcre2_ucd.c
+       pcre2_ucptables.c
       pcre2_valid_utf.c
       pcre2_xclass.c

@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
 3.  Create a new, empty build directory, preferably a subdirectory of the
    source dir. For example, C:\pcre2\pcre2-xx\build.

-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
    Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
    to start Cmake from the Windows Start menu, as this can lead to errors.

@ -373,7 +374,7 @@ Otherwise:
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
   have been created.

-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
   the pcre2 source (wherein which the testdata folder resides), e.g.:

   set srcdir=C:\pcre2\pcre2-10.00
--- a/doc/html/README.txt
+++ b/doc/html/README.txt
@ -8,7 +8,7 @@ features, and the internals have been improved. The original PCRE1 library is
 now obsolete and no longer maintained. The latest release of PCRE2 is available
 in .tar.gz, tar.bz2, or .zip form from this GitHub repository:

-https://github.com/PhilipHazel/pcre2/releases
+https://github.com/PCRE2Project/pcre2/releases

 There is a mailing list for discussion about the development of PCRE2 at
 pcre2-dev@googlegroups.com. You can subscribe by sending an email to
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
 You can access the archives and also subscribe or manage your subscription
 here:

-https://groups.google.com/pcre2-dev
+https://groups.google.com/g/pcre2-dev

 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@ -114,12 +114,18 @@ Building PCRE2 using autotools
 The following instructions assume the use of the widely used "configure; make;
 make install" (autotools) process.

-To build PCRE2 on system that supports autotools, first run the "configure"
-command from the PCRE2 distribution directory, with your current directory set
+If you have downloaded and unpacked a PCRE2 release tarball, run the
+"configure" command from the PCRE2 directory, with your current directory set
 to the directory where you want the files to be created. This command is a
 standard GNU "autoconf" configuration script, for which generic instructions
 are supplied in the file INSTALL.

+The files in the GitHub repository do not contain "configure". If you have
+downloaded the PCRE2 source files from GitHub, before you can run "configure"
+you must run the shell script called autogen.sh. This runs a number of
+autotools to create a "configure" script (you must of course have the autotools
+commands installed in order to do this).
+
 Most commonly, people build PCRE2 within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient. However,
 the usual methods of changing standard defaults are available. For example:
@ -188,10 +194,10 @@ library. They are also documented in the pcre2build man page.

  As well as supporting UTF strings, Unicode support includes support for the
  \P, \p, and \X sequences that recognize Unicode character properties.
-  However, only the basic two-letter properties such as Lu are supported.
-  Escape sequences such as \d and \w in patterns do not by default make use of
-  Unicode properties, but can be made to do so by setting the PCRE2_UCP option
-  or starting a pattern with (*UCP).
+  However, only a subset of Unicode properties are supported; see the
+  pcre2pattern man page for details. Escape sequences such as \d and \w in
+  patterns do not by default make use of Unicode properties, but can be made to
+  do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).

 . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
  of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
@ -369,7 +375,8 @@ library. They are also documented in the pcre2build man page.
  necessary to specify something like LIBS="-lncurses" as well. This is
  because, to quote the readline INSTALL, "Readline uses the termcap functions,
  but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
+  applications which link with readline the option to choose an appropriate
+  library."
  If you get error messages about missing functions tgetstr, tgetent, tputs,
  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
  should fix it.
@ -394,10 +401,10 @@ library. They are also documented in the pcre2build man page.
  Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
  be created. This is normally run under valgrind or used when PCRE2 is
  compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.

 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
  which caused pcre2_match() to use individual blocks on the heap for
@ -411,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
 . Makefile             the makefile that builds the library
 . src/config.h         build-time configuration options for the library
 . src/pcre2.h          the public PCRE2 header file
-. pcre2-config          script that shows the building settings such as CFLAGS
+. pcre2-config         script that shows the building settings such as CFLAGS
                         that were set for "configure"
 . libpcre2-8.pc        )
 . libpcre2-16.pc       ) data for the pkg-config command
@ -571,9 +578,9 @@ at build time" for more details.
 Making new tarballs
 -------------------

-The command "make dist" creates two PCRE2 tarballs, in tar.gz and zip formats.
-The command "make distcheck" does the same, but then does a trial build of the
-new distribution to ensure that it works.
+The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
+zip formats. The command "make distcheck" does the same, but then does a trial
+build of the new distribution to ensure that it works.

 If you have modified any of the man page sources in the doc directory, you
 should first run the PrepareRelease script before making a distribution. This
@ -602,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.

 Many (but not all) of the tests that are not skipped are run twice if JIT
 support is available. On the second run, JIT compilation is forced. This
-testing can be suppressed by putting "nojit" on the RunTest command line.
+testing can be suppressed by putting "-nojit" on the RunTest command line.

 The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
 libraries that are enabled. If you want to run just one set of tests, call
 RunTest with either the -8, -16 or -32 option.

-If valgrind is installed, you can run the tests under it by putting "valgrind"
+If valgrind is installed, you can run the tests under it by putting "-valgrind"
 on the RunTest command line. To run pcre2test on just one or more specific test
 files, give their numbers as arguments to RunTest, for example:

@ -689,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
 different code unit widths.

 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.

 Test 16 is run only when JIT support is not available. It checks that an
@ -905,4 +912,4 @@ The distribution should contain the files listed below.
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 29 October 2021
+Last updated: 15 April 2022
--- a/doc/html/pcre2_compile.html
+++ b/doc/html/pcre2_compile.html
@ -92,8 +92,18 @@ Additional options may be set in the compile context via the
 function.
 </P>
 <P>
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the <i>errorcode</i> argument to the
+<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
+error was encountered is returned via the <i>erroroffset</i> argument.
+</P>
+<P>
+If there is no error, the value returned via <i>errorcode</i> yields the message
+"no error" when passed to <b>pcre2_get_error_message()</b>, and the value
+returned via <i>erroroffset</i> is zero.
 </P>
 <P>
 There is a complete description of the PCRE2 native API, with more detail on
--- a/doc/html/pcre2_jit_stack_create.html
+++ b/doc/html/pcre2_jit_stack_create.html
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
 <b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
 which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
 A maximum stack size of 512KiB to 1MiB should be more than enough for any
-pattern. For more details, see the
+pattern. If the stack couldn't be allocated or the values passed were not
+reasonable, NULL will be returned. For more details, see the
 <a href="pcre2jit.html"><b>pcre2jit</b></a>
 page.
 </P>
--- a/doc/html/pcre2_serialize_decode.html
+++ b/doc/html/pcre2_serialize_decode.html
@ -48,7 +48,7 @@ the following negative error codes:
  PCRE2_ERROR_BADDATA   <i>number_of_codes</i> is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in <i>bytes</i>
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      <i>codes</i> or <i>bytes</i> is NULL
 </pre>
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
--- a/doc/html/pcre2_set_compile_extra_options.html
+++ b/doc/html/pcre2_set_compile_extra_options.html
@ -30,8 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
 housed in a compile context. It completely replaces all the bits. The extra
 options are:
 <pre>
-  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \K in lookarounds PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{df800} to \x{dfff}
-                                         in UTF-8 and UTF-32 modes
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \K in lookarounds
+  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \u, \U, and \x handling
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as a literal following character
  PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \r as \n
--- a/doc/html/pcre2_substitute.html
+++ b/doc/html/pcre2_substitute.html
@ -68,29 +68,29 @@ automatically added.
 The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
 zero-terminated strings. The options are:
 <pre>
-  PCRE2_ANCHORED             Match only at the first position
-  PCRE2_ENDANCHORED          Pattern can match only at end of subject
-  PCRE2_NOTBOL               Subject is not the beginning of a line
-  PCRE2_NOTEOL               Subject is not the end of a line
-  PCRE2_NOTEMPTY             An empty string is not a valid match
-  PCRE2_NOTEMPTY_ATSTART     An empty string at the start of the subject is not a valid match
-  PCRE2_NO_JIT               Do not use JIT matching
-  PCRE2_NO_UTF_CHECK         Do not check the subject or replacement for UTF validity (only relevant if
-                              PCRE2_UTF was set at compile time)
-  PCRE2_SUBSTITUTE_EXTENDED  Do extended replacement processing
-  PCRE2_SUBSTITUTE_GLOBAL    Replace all occurrences in the subject
-  PCRE2_SUBSTITUTE_LITERAL   The replacement string is literal
-  PCRE2_SUBSTITUTE_MATCHED   Use pre-existing match data for 1st match
-  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  If overflow, compute needed length
+  PCRE2_ANCHORED                     Match only at the first position
+  PCRE2_ENDANCHORED                  Match only at end of subject
+  PCRE2_NOTBOL                       Subject is not the beginning of a line
+  PCRE2_NOTEOL                       Subject is not the end of a line
+  PCRE2_NOTEMPTY                     An empty string is not a valid match
+  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of the subject is not a valid match
+  PCRE2_NO_JIT                       Do not use JIT matching
+  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in the subject or replacement
+                                      (only relevant if PCRE2_UTF was set at compile time)
+  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
+  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the subject
+  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
+  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for first match
+  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
-  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  Treat unknown group as unset
-  PCRE2_SUBSTITUTE_UNSET_EMPTY  Simple unset insert = empty string
+  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
+  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
 </pre>
 If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
 PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
 </P>
 <P>
-If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-zero; its
+If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
 contents must be the result of a call to <b>pcre2_match()</b> using the same
 pattern and subject.
 </P>
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
 documentation for more details). If the limit is reached, the negative error
 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
 is built; if it is not, the default is set very large and is essentially
-"unlimited".
+unlimited.
 </P>
 <P>
 A value for the heap limit may also be supplied by an item at the start of a
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
 limit is set, less than the default.
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack for recording backtracking points. The more nested backtracking points
-there are (that is, the deeper the search tree), the more memory is needed.
-Heap memory is used only if the initial vector is too small. If the heap limit
-is set to a value less than 21 (in particular, zero) no heap memory will be
-used. In this case, only patterns that do not have a lot of nested backtracking
-can be successfully processed.
+The <b>pcre2_match()</b> function always needs some heap memory, so setting a
+value of zero guarantees a "heap limit exceeded" error. Details of how
+<b>pcre2_match()</b> uses the heap are given in the
+<a href="pcre2perform.html"><b>pcre2perform</b></a>
+documentation.
 </P>
 <P>
-Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
-when processing pattern recursions, lookarounds, or atomic groups, and only if
-this is not big enough is heap memory used. In this case, too, setting a value
-of zero disables the use of the heap.
+For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
 <br>
 <br>
 <b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
 <br>
 <br>
 This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
-Each time a nested backtracking point is passed, a new memory "frame" is used
+Each time a nested backtracking point is passed, a new memory frame is used
 to remember the state of matching at that point. Thus, this parameter
 indirectly limits the amount of memory that is used in a match. However,
-because the size of each memory "frame" depends on the number of capturing
+because the size of each memory frame depends on the number of capturing
 parentheses, the actual memory limit varies from pattern to pattern. This limit
 was more useful in versions before 10.30, where function recursion was used for
 backtracking.
@ -1383,8 +1381,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
 NULL immediately. Otherwise, the variables to which these point are set to an
 error code and an offset (number of code units) within the pattern,
 respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
-error has occurred. The values are not defined when compilation is successful
-and <b>pcre2_compile()</b> returns a non-NULL value.
+error has occurred. 
 </P>
 <P>
 There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
@ -1399,15 +1396,18 @@ because the textual error messages that are obtained by calling the
 message"
 <a href="#geterrormessage">below)</a>
 should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
-for both positive and negative error codes in <b>pcre2.h</b>.
+for both positive and negative error codes in <b>pcre2.h</b>. When compilation
+is successful <i>errorcode</i> is set to a value that returns the message "no
+error" if passed to <b>pcre2_get_error_message()</b>.
 </P>
 <P>
 The value returned in <i>erroroffset</i> is an indication of where in the
-pattern the error occurred. It is not necessarily the furthest point in the
-pattern that was read. For example, after the error "lookbehind assertion is
-not fixed length", the error offset points to the start of the failing
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
-first code unit of the failing character.
+pattern an error occurred. When there is no error, zero is returned. A non-zero
+value is not necessarily the furthest point in the pattern that was read. For
+example, after the error "lookbehind assertion is not fixed length", the error
+offset points to the start of the failing assertion. For an invalid UTF-8 or
+UTF-16 string, the offset is that of the first code unit of the failing
+character.
 </P>
 <P>
 Some errors are not detected until the whole pattern has been scanned; in these
@ -1845,7 +1845,7 @@ undefined. It may cause your program to crash or loop.
 </P>
 <P>
 Note that this option can also be passed to <b>pcre2_match()</b> and
-<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
+<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
 string.
 </P>
 <P>
@ -2055,8 +2055,8 @@ point. However, this applies only to characters whose code points are less than
 \d.
 </P>
 <P>
-When PCRE2 is built with Unicode support (the default), the Unicode properties
-of all characters can be tested with \p and \P, or, alternatively, the
+When PCRE2 is built with Unicode support (the default), certain Unicode
+character properties can be tested with \p and \P, or, alternatively, the
 PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
 friends to use Unicode property support instead of the built-in tables.
 PCRE2_UCP also causes upper/lower casing operations on characters with code
@ -2316,7 +2316,7 @@ return zero. The third argument should point to a <b>size_t</b> variable.
  PCRE2_INFO_LASTCODETYPE
 </pre>
 Returns 1 if there is a rightmost literal code unit that must exist in any
-matched string, other than at its start. The third argument should  point to a
+matched string, other than at its start. The third argument should point to a
 <b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
 returned, the code unit value itself can be retrieved using
 PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
@ -2640,7 +2640,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
 <i>startoffset</i>. The length and offset are in code units, not characters.
 That is, they are in bytes for the 8-bit library, 16-bit code units for the
 16-bit library, and 32-bit code units for the 32-bit library, whether or not
-UTF processing is enabled.
+UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
+<i>length</i> is zero, the subject is assumed to be an empty string. If
+<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
 </P>
 <P>
 If <i>startoffset</i> is greater than the length of the subject,
@ -3144,11 +3146,11 @@ The backtracking match limit was reached.
 <pre>
  PCRE2_ERROR_NOMEMORY
 </pre>
-If a pattern contains many nested backtracking points, heap memory is used to
-remember them. This error is given when the memory allocation function (default
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+Heap memory is used to remember backtracking points. This error is given when
+the memory allocation function (default or custom) fails. Note that a different
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
 <pre>
  PCRE2_ERROR_NULL
 </pre>
@ -3394,12 +3396,17 @@ same number causes an error at compile time.
 <P>
 This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
 subject string in <i>outputbuffer</i>, replacing parts that were matched with
-the <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This
-can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
-option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
-replacement string(s). The default action is to perform just one replacement if
-the pattern matches, but there is an option that requests multiple replacements
-(see PCRE2_SUBSTITUTE_GLOBAL below).
+the <i>replacement</i> string, whose length is supplied in <b>rlength</b>, which
+can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
+special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
+replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
+error occurs if <i>replacement</i> is NULL.
+</P>
+<P>
+There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
+the replacement string(s). The default action is to perform just one
+replacement if the pattern matches, but there is an option that requests
+multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
 </P>
 <P>
 If successful, <b>pcre2_substitute()</b> returns the number of substitutions
@ -3433,12 +3440,12 @@ block may or may not have been changed.
 As well as the usual options for <b>pcre2_match()</b>, a number of additional
 options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
 One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
-<i>match_data</i> block must be provided, and it must have been used for an
-external call to <b>pcre2_match()</b>. The data in the <i>match_data</i> block
-(return code, offset vector) is used for the first substitution instead of
-calling <b>pcre2_match()</b> from within <b>pcre2_substitute()</b>. This allows
-an application to check for a match before choosing to substitute, without
-having to repeat the match.
+<i>match_data</i> block must be provided, and it must have already been used for
+an external call to <b>pcre2_match()</b> with the same pattern and subject
+arguments. The data in the <i>match_data</i> block (return code, offset vector)
+is then used for the first substitution instead of calling <b>pcre2_match()</b>
+from within <b>pcre2_substitute()</b>. This allows an application to check for a
+match before choosing to substitute, without having to repeat the match.
 </P>
 <P>
 The contents of the externally supplied match data block are not changed when
@ -3583,7 +3590,7 @@ and force lower case. The escape sequences change the current state: \U and
 terminating a \Q quoted sequence) reverts to no case forcing. The sequences
 \u and \l force the next character (if it is a letter) to upper or lower
 case, respectively, and then the state automatically reverts to no case
-forcing. Case forcing applies to all inserted  characters, including those from
+forcing. Case forcing applies to all inserted characters, including those from
 capture groups and letters within \Q...\E quoted sequences. If either
 PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
 properties are used for case forcing characters whose code points are greater
@ -3655,7 +3662,9 @@ default.
 </P>
 <P>
 PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
-<i>match_data</i> argument is NULL.
+<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
+arguments are NULL. For backward compatibility reasons an exception is made for
+the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
 </P>
 <P>
 PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
@ -3810,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
 <P>
 The function <b>pcre2_dfa_match()</b> is called to match a subject string
 against a compiled pattern, using a matching algorithm that scans the subject
-string just once (not counting lookaround assertions), and does not backtrack.
-This has different characteristics to the normal algorithm, and is not
-compatible with Perl. Some of the features of PCRE2 patterns are not supported.
-Nevertheless, there are times when this kind of matching can be useful. For a
-discussion of the two matching algorithms, and a list of features that
-<b>pcre2_dfa_match()</b> does not support, see the
+string just once (not counting lookaround assertions), and does not backtrack
+(except when processing lookaround assertions). This has different
+characteristics to the normal algorithm, and is not compatible with Perl. Some
+of the features of PCRE2 patterns are not supported. Nevertheless, there are
+times when this kind of matching can be useful. For a discussion of the two
+matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
+not support, see the
 <a href="pcre2matching.html"><b>pcre2matching</b></a>
 documentation.
 </P>
@ -3850,7 +3860,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
 </PRE>
 </P>
 <br><b>
-Option bits for <b>pcre_dfa_match()</b>
+Option bits for <b>pcre2_dfa_match()</b>
 </b><br>
 <P>
 The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
@ -4008,9 +4018,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC42" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 30 August 2021
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2build.html
+++ b/doc/html/pcre2build.html
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
 UTF support allows the libraries to process character code points up to
 0x10ffff in the strings that they handle. Unicode support also gives access to
 the Unicode properties of characters, using pattern escapes such as \P, \p,
-and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
-supported. Details are given in the
+and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
+script names, and some bi-directional properties are supported. Details are
+given in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 documentation.
 </P>
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
 counting is done differently).
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The <b>pcre2_match()</b> function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 <a href="pcre2api.html"><b>pcre2api</b></a>
 documentation. The default limit (in effect unlimited) is 20 million. You can
 change this by a setting such as
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
 for --with-match-limit. You can set a lower default limit by adding, for
 example,
 <pre>
-  --with-match-limit_depth=10000
+  --with-match-limit-depth=10000
 </pre>
 to the <b>configure</b> command. This value can be overridden at run time. This
 depth limit indirectly limits the amount of heap memory that is used, but
@ -608,16 +608,16 @@ give a warning.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC26" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 20 March 2020
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2compat.html
+++ b/doc/html/pcre2compat.html
@ -18,33 +18,41 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
 <P>
 This document describes some of the differences in the ways that PCRE2 and Perl
 handle regular expressions. The differences described here are with respect to
-Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
+Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
 information may at times be out of date.
 </P>
 <P>
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
+1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
+behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
+next character unless it is the start of a newline sequence. This means that,
+if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
+(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
+EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
+indicator.
+</P>
+<P>
+2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
 have are given in the
 <a href="pcre2unicode.html"><b>pcre2unicode</b></a>
 page.
 </P>
 <P>
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
+3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
 they do not mean what you might think. For example, (?!a){3} does not assert
 that the next three characters are not "a". It just asserts that the next
 character is not "a" three times (in principle; PCRE2 optimizes this to run the
 assertion just once). Perl allows some repeat quantifiers on other assertions,
-for example, \b* (but not \b{3}, though oddly it does allow ^{3}), but these
-do not seem to have any use. PCRE2 does not allow any kind of quantifier on
-non-lookaround assertions.
+for example, \b*, but these do not seem to have any use. PCRE2 does not allow
+any kind of quantifier on non-lookaround assertions.
 </P>
 <P>
-3. Capture groups that occur inside negative lookaround assertions are counted,
+4. Capture groups that occur inside negative lookaround assertions are counted,
 but their entries in the offsets vector are set only when a negative assertion
 is a condition that has a matching branch (that is, the condition is false).
 Perl may set such capture groups in other circumstances.
 </P>
 <P>
-4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
+5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
 \U, and \N when followed by a character name. \N on its own, matching a
 non-newline character, and \N{U+dd..}, matching a Unicode code point, are
 supported. The escapes that modify the case of following letters are
@ -55,26 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
 interprets them.
 </P>
 <P>
-5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
+6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
 built with Unicode support (the default). The properties that can be tested
 with \p and \P are limited to the general category properties such as Lu and
-Nd, script names such as Greek or Han, and the derived properties Any and L&.
-Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
-is limited. See the
+Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
+derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
+(surrogate) property, but in PCRE2 its use is limited. See the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 documentation for details. The long synonyms for property names that Perl
 supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
 to prefix any of these properties with "Is".
 </P>
 <P>
-6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
+7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
 in between are treated as literals. However, this is slightly different from
 Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
-they cause variable interpolation (but of course PCRE2 does not have
-variables). Also, Perl does "double-quotish backslash interpolation" on any
-backslashes between \Q and \E which, its documentation says, "may lead to
-confusing results". PCRE2 treats a backslash between \Q and \E just like any
-other character. Note the following examples:
+they cause variable interpolation (PCRE2 does not have variables). Also, Perl
+does "double-quotish backslash interpolation" on any backslashes between \Q
+and \E which, its documentation says, "may lead to confusing results". PCRE2
+treats a backslash between \Q and \E just like any other character. Note the
+following examples:
 <pre>
    Pattern            PCRE2 matches     Perl matches

@ -88,19 +96,19 @@ The \Q...\E sequence is recognized both inside and outside character classes
 by both PCRE2 and Perl.
 </P>
 <P>
-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
+8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
 constructions. However, PCRE2 does have a "callout" feature, which allows an
 external function to be called during pattern matching. See the
 <a href="pcre2callout.html"><b>pcre2callout</b></a>
 documentation for details.
 </P>
 <P>
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up
+9. Subroutine calls (whether recursive or not) were treated as atomic groups up
 to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
 into subroutine calls is now supported, as in Perl.
 </P>
 <P>
-9. In PCRE2, if any of the backtracking control verbs are used in a group that
+10. In PCRE2, if any of the backtracking control verbs are used in a group that
 is called as a subroutine (whether or not recursively), their effect is
 confined to that group; it does not extend to the surrounding pattern. This is
 not always the case in Perl. In particular, if (*THEN) is present in a group
@ -109,20 +117,20 @@ the group does not contain any | characters. Note that such groups are
 processed as anchored at the point where they are tested.
 </P>
 <P>
-10. If a pattern contains more than one backtracking control verb, the first
+11. If a pattern contains more than one backtracking control verb, the first
 one that is backtracked onto acts. For example, in the pattern
 A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
 triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
 same as PCRE2, but there are cases where it differs.
 </P>
 <P>
-11. There are some differences that are concerned with the settings of captured
+12. There are some differences that are concerned with the settings of captured
 strings when part of a pattern is repeated. For example, matching "aba" against
 the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
 "b".
 </P>
 <P>
-12. PCRE2's handling of duplicate capture group numbers and names is not as
+13. PCRE2's handling of duplicate capture group numbers and names is not as
 general as Perl's. This is a consequence of the fact the PCRE2 works internally
 just with numbers, using an external table to translate between numbers and
 names. In particular, a pattern such as (?|(?&#60;a&#62;A)|(?&#60;b&#62;B)), where the two
@ -132,42 +140,43 @@ to distinguish which group matched, because both names map to capture group
 number 1. To avoid this confusing situation, an error is given at compile time.
 </P>
 <P>
-13. Perl used to recognize comments in some places that PCRE2 does not, for
+14. Perl used to recognize comments in some places that PCRE2 does not, for
 example, between the ( and ? at the start of a group. If the /x modifier is
 set, Perl allowed white space between ( and ? though the latest Perls give an
 error (for a while it was just deprecated). There may still be some cases where
 Perl behaves differently.
 </P>
 <P>
-14. Perl, when in warning mode, gives warnings for character classes such as
+15. Perl, when in warning mode, gives warnings for character classes such as
 [A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
 warning features, so it gives an error in these cases because they are almost
 certainly user mistakes.
 </P>
 <P>
-15. In PCRE2, the upper/lower case character properties Lu and Ll are not
+16. In PCRE2, the upper/lower case character properties Lu and Ll are not
 affected when case-independent matching is specified. For example, \p{Lu}
 always matches an upper case letter. I think Perl has changed in this respect;
-in the release at the time of writing (5.32), \p{Lu} and \p{Ll} match all
+in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
 letters, regardless of case, when case independence is specified.
 </P>
 <P>
-16. From release 5.32.0, Perl locks out the use of \K in lookaround
+17. From release 5.32.0, Perl locks out the use of \K in lookaround
 assertions. From release 10.38 PCRE2 does the same by default. However, there
 is an option for re-enabling the previous behaviour. When this option is set,
 \K is acted on when it occurs in positive assertions, but is ignored in
 negative assertions.
 </P>
 <P>
-17. PCRE2 provides some extensions to the Perl regular expression facilities.
+18. PCRE2 provides some extensions to the Perl regular expression facilities.
 Perl 5.10 included new features that were not in earlier versions of Perl, some
 of which (such as named parentheses) were in PCRE2 for some time before. This
-list is with respect to Perl 5.32:
+list is with respect to Perl 5.34:
 <br>
 <br>
 (a) Although lookbehind assertions in PCRE2 must match fixed length strings,
 each alternative toplevel branch of a lookbehind assertion can match a
-different length of string. Perl requires them all to have the same length.
+different length of string. Perl used to require them all to have the same
+length, but the latest version has some variable length support.
 <br>
 <br>
 (b) From PCRE2 10.23, backreferences to groups of fixed length are supported
@ -221,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible
 lookarounds are atomic.
 </P>
 <P>
-18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
+19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa
 modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
 rules. This separation cannot be represented with PCRE2_UCP.
 </P>
 <P>
-19. Perl has different limits than PCRE2. See the
+20. Perl has different limits than PCRE2. See the
 <a href="pcre2limit.html"><b>pcre2limit</b></a>
 documentation for details. Perl went with 5.10 from recursion to iteration
 keeping the intermediate matches on the heap, which is ~10% slower but does not
@ -248,7 +257,7 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 30 August 2021
+Last updated: 08 December 2021
 <br>
 Copyright &copy; 1997-2021 University of Cambridge.
 <br>
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@ -71,13 +71,15 @@ For example:
 <pre>
  pcre2grep some-pattern file1 - file3
 </pre>
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how <b>pcre2grep</b> behaves. In
-particular, the <b>-M</b> option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-<b>-N</b> (<b>--newline</b>) option.
+However, there are options that can change how <b>pcre2grep</b> behaves. For
+example, the <b>-M</b> option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the <b>-N</b>
+(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
+not file names are shown, and the <b>-Z</b> option changes the file name
+terminator to a zero byte.
 </P>
 <P>
 The amount of memory used for buffering files that are being scanned is
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of <i>number</i>
-is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
+context lines (the <b>-Z</b> option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
+<b>-A</b> is ignored.
 </P>
 <P>
 <b>-a</b>, <b>--text</b>
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
 lines are output if the previous match or the start of the file is within
 <i>number</i> lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of <i>number</i> is expected to be relatively small. When
+instead of a colon for the context lines (the <b>-Z</b> option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of <i>number</i> is expected to be relatively small. When
 <b>-c</b> is used, <b>-B</b> is ignored.
 </P>
 <P>
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
 <P>
 <b>-H</b>, <b>--with-filename</b>
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the <b>-M</b> option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the <b>-M</b> option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
 </P>
 <P>
 <b>-h</b>, <b>--no-filename</b>
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
+Suppress the output file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The <b>-Z</b> option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>--heap-limit</b>=<i>number</i>
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
 <b>-L</b>, <b>--files-without-match</b>
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous <b>-H</b>,
-<b>-h</b>, or <b>-l</b> options.
+output once, on a separate line by default, but if the <b>-Z</b> option is set,
+they are separated by zero bytes instead of newlines. This option overrides any
+previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>-l</b>, <b>--files-with-matches</b>
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the <b>-c</b> (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-<b>-c</b> is a way of suppressing the listing of files with no matches that
+a separate line, but if the <b>-Z</b> option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the <b>-c</b> (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with <b>-c</b> is a way of suppressing the listing of files with no matches that
 occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
 <b>-h</b>, or <b>-L</b> options.
 </P>
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
 <br>
 <br>
 The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
+1024 bytes), the maximum amount of heap memory that may be used for matching.
 <br>
 <br>
 The <b>--depth-limit</b> option limits the depth of nested backtracking points,
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the <b>--include</b> or <b>--exclude</b> options.
 </P>
+<P>
+<b>-Z</b>, <b>--null</b>
+Terminate file names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
+</P>
 <br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
 <P>
 The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
@ -1053,9 +1066,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC16" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 31 August 2021
+Last updated: 30 July 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2jit.html
+++ b/doc/html/pcre2jit.html
@ -269,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
 for currently suspended match(es).
 </P>
 <P>
-In a multithread application, if you do not
-specify a JIT stack, or if you assign or pass back NULL from a callback, that
-is thread-safe, because each thread has its own machine stack. However, if you
-assign or pass back a non-NULL JIT stack, this must be a different stack for
-each thread so that the application is thread-safe.
+In a multithread application, if you do not specify a JIT stack, or if you
+assign or pass back NULL from a callback, that is thread-safe, because each
+thread has its own machine stack. However, if you assign or pass back a
+non-NULL JIT stack, this must be a different stack for each thread so that the
+application is thread-safe.
 </P>
 <P>
 Strictly speaking, even more is allowed. You can assign the same non-NULL stack
@ -382,8 +382,8 @@ out this complicated API.
 <b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
 </P>
 <P>
-The JIT executable allocator does not free all memory when it is possible.
-It expects new allocations, and keeps some free memory around to improve
+The JIT executable allocator does not free all memory when it is possible. It
+expects new allocations, and keeps some free memory around to improve
 allocation speed. However, in low memory conditions, it might be better to free
 all possible memory. You can cause this to happen by calling
 pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
@ -442,10 +442,10 @@ that was not compiled.
 <P>
 When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
 number of other sanity checks are performed on the arguments. For example, if
-the subject pointer is NULL, an immediate error is given. Also, unless
-PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
-interests of speed, these checks do not happen on the JIT fast path, and if
-invalid data is passed, the result is undefined.
+the subject pointer is NULL but the length is non-zero, an immediate error is
+given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
+for validity. In the interests of speed, these checks do not happen on the JIT
+fast path, and if invalid data is passed, the result is undefined.
 </P>
 <P>
 Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
@ -466,9 +466,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC14" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 23 May 2019
+Last updated: 30 November 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2limits.html
+++ b/doc/html/pcre2limits.html
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
 </P>
+<P>
+The maximum amount of heap memory used for matching is controlled by the heap 
+limit, which can be set in a pattern or in a match context. The default is a 
+very large number, effectively unlimited.
+</P>
 <br><b>
 AUTHOR
 </b><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -86,9 +91,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 02 February 2019
+Last updated: 26 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2pattern.html
+++ b/doc/html/pcre2pattern.html
@ -534,7 +534,7 @@ for themselves. For example, outside a character class:
  \0113  is a tab followed by the character "3"
  \113   might be a backreference, otherwise the character with octal code 113
  \377   might be a backreference, otherwise the value 255 (decimal)
-  \81    is always a backreference .sp
+  \81    is always a backreference
 </pre>
 Note that octal values of 100 or greater that are specified using this syntax
 must not be introduced by a leading zero, because no more than three octal
@ -776,199 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
 sequences are of course limited to testing characters whose code points are
 less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
 greater than 0x10ffff (the Unicode limit) may be encountered. These are all
-treated as being in the Unknown script and with an unassigned type. The extra
-escape sequences are:
+treated as being in the Unknown script and with an unassigned type.
+</P>
+<P>
+Matching characters by Unicode property is not fast, because PCRE2 has to do a
+multistage table lookup in order to find a character's property. That is why
+the traditional escape sequences such as \d and \w do not use Unicode
+properties in PCRE2 by default, though you can make them do so by setting the
+PCRE2_UCP option or by starting the pattern with (*UCP).
+</P>
+<P>
+The extra escape sequences that provide property support are:
 <pre>
  \p{<i>xx</i>}   a character with the <i>xx</i> property
  \P{<i>xx</i>}   a character without the <i>xx</i> property
  \X       a Unicode extended grapheme cluster
 </pre>
-The property names represented by <i>xx</i> above are case-sensitive. There is
-support for Unicode script names, Unicode general category properties, "Any",
-which matches any character (including newline), and some special PCRE2
-properties (described in the
-<a href="#extraprops">next section).</a>
-Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
-Note that \P{Any} does not match any characters, so always causes a match
-failure.
+The property names represented by <i>xx</i> above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
+<a href="#extraprops">below).</a>
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \P{Any} does not match any characters, so always causes a
+match failure.
+</P>
+<br><b>
+Script properties for \p and \P
+</b><br>
+<P>
+There are three different syntax forms for matching a script. Each Unicode
+character has a basic script and, optionally, a list of other scripts ("Script
+Extensions") with which it is commonly used. Using the Adlam script as an
+example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
+\p{scx:Adlam} matches, in addition, characters that have Adlam in their
+extensions list. The full names "script" and "script extensions" for the
+property types are recognized, and an equals sign is an alternative to the
+colon. If a script name is given without a property type, for example,
+\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
+interpretation at release 5.26 and PCRE2 changed at release 10.40.
 </P>
 <P>
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-<pre>
-  \p{Greek}
-  \P{Han}
-</pre>
 Unassigned characters (and in non-UTF 32-bit mode, characters with code points
 greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
 part of an identified script are lumped together as "Common". The current list
-of scripts is:
-</P>
-<P>
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cypro_Minoan,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Old_Uyghur,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangsa,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Toto,
-Ugaritic,
-Unknown,
-Vai,
-Vithkuqi,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
+of recognized script names and their 4-character abbreviations can be obtained
+by running this command:
+<pre>
+  pcre2test -LS
+
+</PRE>
 </P>
+<br><b>
+The general category property for \p and \P
+</b><br>
 <P>
 Each character has exactly one Unicode general category property, specified by
 a two-letter abbreviation. For compatibility with Perl, negation can be
@ -1030,9 +893,9 @@ The following general category property codes are supported:
  Zp    Paragraph separator
  Zs    Space separator
 </pre>
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
+The special property LC, which has the synonym L&, is also supported: it
+matches a character that has the Lu, Ll, or Lt property, in other words, a
+letter that is not classified as a modifier or "other".
 </P>
 <P>
 The Cs (Surrogate) property applies only to characters whose code points are in
@ -1059,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
 example, \p{Lu} always matches only upper case letters. This is different from
 the behaviour of current versions of Perl.
 </P>
+<br><b>
+Binary (yes/no) properties for \p and \P
+</b><br>
 <P>
-Matching characters by Unicode property is not fast, because PCRE2 has to do a
-multistage table lookup in order to find a character's property. That is why
-the traditional escape sequences such as \d and \w do not use Unicode
-properties in PCRE2 by default, though you can make them do so by setting the
-PCRE2_UCP option or by starting the pattern with (*UCP).
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\p and \P, along with their abbreviations, by running this command:
+<pre>
+  pcre2test -LP
+
+</PRE>
+</P>
+<br><b>
+The Bidi_Class property for \p and \P
+</b><br>
+<P>
+<pre>
+  \p{Bidi_Class:&#60;class&#62;}   matches a character with the given class
+  \p{BC:&#60;class&#62;}           matches a character with the given class
+</pre>
+The recognized classes are:
+<pre>
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
+</pre>
+An equals sign may be used instead of a colon. The class names are
+case-insensitive; only the short names listed above are recognized.
 </P>
 <br><b>
 Extended grapheme clusters
@ -1341,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
 <P>
 Outside a character class, a dot in the pattern matches any one character in
 the subject string except (by default) a character that signifies the end of a
-line.
+line. One or more characters may be specified as line terminators (see
+<a href="#newlines">"Newline conventions"</a>
+above).
 </P>
 <P>
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
+Dot never matches a single line-ending character. When the two-character
+sequence CRLF is the only line ending, dot does not match CR if it is
+immediately followed by LF, but otherwise it matches all characters (including
+isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
+of CR or LF match dot. When all Unicode line endings are being recognized, dot
+does not match CR or LF or any of the other line ending characters.
 </P>
 <P>
 The behaviour of dot with regard to newlines can be changed. If the
@ -2180,10 +2087,10 @@ be easier to remember:
 <pre>
  (*atomic:\d+)foo
 </pre>
-This kind of parenthesized group "locks up" the  part of the pattern it
-contains once it has matched, and a failure further into the pattern is
-prevented from backtracking into it. Backtracking past it to previous items,
-however, works as normal.
+This kind of parenthesized group "locks up" the part of the pattern it contains
+once it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
 </P>
 <P>
 An alternative description is that a group of this type matches exactly the
@ -3859,9 +3766,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC32" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 30 August 2021
+Last updated: 12 January 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2perform.html
+++ b/doc/html/pcre2perform.html
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code. 
+</P>
+<P>
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+</P>
+<P>
+Until release 10.41, an initial 20KiB frames vector was allocated on the system 
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to <b>pcre2_match()</b>. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+</P>
+<P>
+The size of the initial block is the larger of 20KiB or ten times the pattern's 
+frame size, unless the heap limit is less than this, in which case the heap 
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is 
+checked only when a new block is to be allocated. Reducing the heap limit 
+between calls to <b>pcre2_match()</b> with the same match data block does not 
+affect the saved block.
 </P>
 <P>
 In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC6" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 03 February 2019
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2serialize.html
+++ b/doc/html/pcre2serialize.html
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
 <br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
 <P>
 <b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
-<b>  int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
+<b>  int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
 <b>  pcre2_general_context *<i>gcontext</i>);</b>
 <br>
 <br>
-<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
-<b>  int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
+<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
+<b>  int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
 <b>  PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
 <br>
 <br>
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
 <pre>
  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 </pre>
@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
 <b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
 stream is no longer needed and can be discarded.
 <pre>
-  int32_t number_of_codes;
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = &#60;serialized data&#62;;
  int32_t number_of_codes =
--- a/doc/html/pcre2syntax.html
+++ b/doc/html/pcre2syntax.html
@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
 <li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
 <li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
 <li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
-<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
-<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
-<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
-<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
-<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
-<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
-<li><a name="TOC13" href="#SEC13">CAPTURING</a>
-<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
-<li><a name="TOC15" href="#SEC15">COMMENT</a>
-<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
-<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
-<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
-<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
-<li><a name="TOC20" href="#SEC20">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
-<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
-<li><a name="TOC22" href="#SEC22">BACKREFERENCES</a>
-<li><a name="TOC23" href="#SEC23">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
-<li><a name="TOC24" href="#SEC24">CONDITIONAL PATTERNS</a>
-<li><a name="TOC25" href="#SEC25">BACKTRACKING CONTROL</a>
-<li><a name="TOC26" href="#SEC26">CALLOUTS</a>
-<li><a name="TOC27" href="#SEC27">SEE ALSO</a>
-<li><a name="TOC28" href="#SEC28">AUTHOR</a>
-<li><a name="TOC29" href="#SEC29">REVISION</a>
+<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
+<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
+<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
+<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
+<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
+<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
+<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
+<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
+<li><a name="TOC15" href="#SEC15">CAPTURING</a>
+<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
+<li><a name="TOC17" href="#SEC17">COMMENT</a>
+<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
+<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
+<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
+<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
+<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
+<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
+<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
+<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
+<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
+<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
+<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
+<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
+<li><a name="TOC30" href="#SEC30">AUTHOR</a>
+<li><a name="TOC31" href="#SEC31">REVISION</a>
 </ul>
 <br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
 <P>
@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
 sequences is changed to use Unicode properties and they match many more
 characters.
 </P>
+<P>
+Property descriptions in \p and \P are matched caselessly; hyphens,
+underscores, and white space are ignored, in accordance with Unicode's "loose
+matching" rules.
+</P>
 <br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
 <P>
 <pre>
@ -152,6 +159,7 @@ characters.
  Lo         Other letter
  Lt         Title case letter
  Lu         Upper case letter
+  Lc         Ll, Lu, or Lt
  L&         Ll, Lu, or Lt

  M          Mark
@ -198,171 +206,58 @@ characters.
 Perl and POSIX space are now the same. Perl added VT to its space character set
 at release 5.18.
 </P>
-<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
+<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
 <P>
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cypro_Minoan,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Old_Uyghur,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangsa,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Toto,
-Ugaritic,
-Vai,
-Vithkuqi,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\p and \P, along with their abbreviations, by running this command:
+<pre>
+  pcre2test -LP
+</PRE>
 </P>
-<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
+<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
+<P>
+Many script names and their 4-letter abbreviations are recognized in
+\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
+course). You can obtain a list of these scripts by running this command:
+<pre>
+  pcre2test -LS
+</PRE>
+</P>
+<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
+<P>
+<pre>
+  \p{Bidi_Class:&#60;class&#62;}   matches a character with the given class
+  \p{BC:&#60;class&#62;}           matches a character with the given class
+</pre>
+The recognized classes are:
+<pre>
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
+</PRE>
+</P>
+<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
 <P>
 <pre>
  [...]       positive character class
@ -390,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
 but some of them use Unicode properties if PCRE2_UCP is set. You can use
 \Q...\E inside a character class.
 </P>
-<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
+<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
 <P>
 <pre>
  ?           0 or 1, greedy
@ -411,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
  {n,}?       n or more, lazy
 </PRE>
 </P>
-<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
+<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
 <P>
 <pre>
  \b          word boundary
@ -429,7 +324,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
  \G          first matching position in subject
 </PRE>
 </P>
-<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
+<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
 <P>
 <pre>
  \K          set reported start of match
@ -439,13 +334,13 @@ for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
 option is set, the previous behaviour is re-enabled. When this option is set,
 \K is honoured in positive assertions, but ignored in negative ones.
 </P>
-<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
+<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
 <P>
 <pre>
  expr|expr|expr...
 </PRE>
 </P>
-<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
+<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
 <P>
 <pre>
  (...)           capture group
@ -460,20 +355,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
 in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
 both cases, a name must not start with a digit.
 </P>
-<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
+<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
 <P>
 <pre>
  (?&#62;...)         atomic non-capture group
  (*atomic:...)   atomic non-capture group
 </PRE>
 </P>
-<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
+<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
 <P>
 <pre>
  (?#....)        comment (not nestable)
 </PRE>
 </P>
-<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
+<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
 <P>
 Changes of these options within a group are automatically cancelled at the end
 of the group.
@ -518,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
 application can lock out the use of (*UTF) and (*UCP) by setting the
 PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
 </P>
-<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
+<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
 <P>
 These are recognized only at the very start of the pattern or after option
 settings with a similar syntax.
@ -531,7 +426,7 @@ settings with a similar syntax.
  (*NUL)          the NUL character (binary zero)
 </PRE>
 </P>
-<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
+<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
 <P>
 These are recognized only at the very start of the pattern or after option
 setting with a similar syntax.
@ -540,7 +435,7 @@ setting with a similar syntax.
  (*BSR_UNICODE)  any Unicode newline sequence
 </PRE>
 </P>
-<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
+<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
 <P>
 <pre>
  (?=...)                     )
@ -561,7 +456,7 @@ setting with a similar syntax.
 </pre>
 Each top-level branch of a lookbehind must be of a fixed length.
 </P>
-<br><a name="SEC20" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
+<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
 <P>
 These assertions are specific to PCRE2 and are not Perl-compatible.
 <pre>
@ -574,7 +469,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  (*non_atomic_positive_lookbehind:...)  )
 </PRE>
 </P>
-<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
+<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
 <P>
 <pre>
  (*script_run:...)           ) script run, can be backtracked into
@ -584,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  (*asr:...)                  )
 </PRE>
 </P>
-<br><a name="SEC22" href="#TOC1">BACKREFERENCES</a><br>
+<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
 <P>
 <pre>
  \n              reference by number (can be ambiguous)
@ -601,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  (?P=name)       reference by name (Python)
 </PRE>
 </P>
-<br><a name="SEC23" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
+<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
 <P>
 <pre>
  (?R)            recurse whole pattern
@ -620,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  \g'-n'          call subroutine by relative number (PCRE2 extension)
 </PRE>
 </P>
-<br><a name="SEC24" href="#TOC1">CONDITIONAL PATTERNS</a><br>
+<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
 <P>
 <pre>
  (?(condition)yes-pattern)
@ -643,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
 conditions or recursion tests. Such a condition is interpreted as a reference
 condition if the relevant named group exists.
 </P>
-<br><a name="SEC25" href="#TOC1">BACKTRACKING CONTROL</a><br>
+<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
 <P>
 All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
 name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
@ -670,7 +565,7 @@ pattern is not anchored.
 The effect of one of these verbs in a group called as a subroutine is confined
 to the subroutine call.
 </P>
-<br><a name="SEC26" href="#TOC1">CALLOUTS</a><br>
+<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
 <P>
 <pre>
  (?C)            callout (assumed number 0)
@ -681,12 +576,12 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
 start and the end), and the starting delimiter { matched with the ending
 delimiter }. To encode the ending delimiter within the string, double it.
 </P>
-<br><a name="SEC27" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
 <P>
 <b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
 <b>pcre2matching</b>(3), <b>pcre2</b>(3).
 </P>
-<br><a name="SEC28" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
@ -695,11 +590,11 @@ Retired from University Computing Service
 Cambridge, England.
 <br>
 </P>
-<br><a name="SEC29" href="#TOC1">REVISION</a><br>
+<br><a name="SEC31" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 30 August 2021
+Last updated: 12 January 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@ -78,7 +78,7 @@ to 8-bit code units for output.
 </P>
 <P>
 In the rest of this document, the names of library functions and structures
-are given in generic form, for example, <b>pcre_compile()</b>. The actual
+are given in generic form, for example, <b>pcre2_compile()</b>. The actual
 names used in the libraries have a suffix _8, _16, or _32, as appropriate.
 <a name="inputencoding"></a></P>
 <br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
@ -253,7 +253,19 @@ available, and the use of JIT for matching is verified.
 <b>-LM</b>
 List modifiers: write a list of available pattern and subject modifiers to the
 standard output, then exit with zero exit code. All other options are ignored.
-If both -C and -LM are present, whichever is first is recognized.
+If both -C and any -Lx options are present, whichever is first is recognized.
+</P>
+<P>
+<b>-LP</b>
+List properties: write a list of recognized Unicode properties to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
+</P>
+<P>
+<b>-LS</b>
+List scripts: write a list of recognized Unicode script names to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
 </P>
 <P>
 <b>-pattern</b> <i>modifier-list</i>
@ -1229,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
      copy=&#60;number or name&#62;      copy captured substring
      depth_limit=&#60;n&#62;            set a depth limit
      dfa                        use <b>pcre2_dfa_match()</b>
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
      get=&#60;number or name&#62;       extract captured substring
      getall                     extract all captured substrings
  /g  global                     global matching
@ -1239,6 +1252,8 @@ pattern, but can be overridden by modifiers on the subject.
      match_limit=&#60;n&#62;            set a match limit
      memory                     show heap memory usage
      null_context               match with a NULL context
+      null_replacement           substitute with NULL replacement
+      null_subject               match with NULL subject
      offset=&#60;n&#62;                 set starting offset
      offset_limit=&#60;n&#62;           set offset limit
      ovector=&#60;n&#62;                set size of output vector
@ -1550,7 +1565,7 @@ Setting heap, match, and depth limits
 <P>
 The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
 the appropriate limits in the match context. These values are ignored when the
-<b>find_limits</b> modifier is specified.
+<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
 </P>
 <br><b>
 Finding minimum limits
@ -1560,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
 calls the relevant matching function several times, setting different values in
 the match context via <b>pcre2_set_heap_limit()</b>,
 <b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 </P>
 <P>
 When using this modifier, the pattern should not contain any limit settings
@ -1589,9 +1608,7 @@ overall amount of computing resource that is used.
 </P>
 <P>
 For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 </P>
 <br><b>
 Showing MARK names
@ -1609,12 +1626,10 @@ Showing memory usage
 <P>
 The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the <b>memory</b> modifier never has any effect. For this modifier to work, the
+<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
+is used only when a match requires more internal workspace than the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 <b>null_context</b> modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 </P>
@ -1668,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
 passing the replacement string as zero-terminated.
 </P>
 <br><b>
-Passing a NULL context
+Passing a NULL context, subject, or replacement
 </b><br>
 <P>
 Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
@ -1676,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
 If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-<b>find_limits</b> or <b>substitute_callout</b> modifiers.
+<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
+modifiers.
+</P>
+<P>
+Similarly, for testing purposes, if the <b>null_subject</b> or
+<b>null_replacement</b> modifier is set, the subject or replacement string
+pointers are passed as NULL, respectively, to the relevant functions.
 </P>
 <br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
 <P>
@ -2122,9 +2143,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC21" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 30 August 2021
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2unicode.html
+++ b/doc/html/pcre2unicode.html
@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
 <P>
 When PCRE2 is built with Unicode support, the escape sequences \p{..},
 \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
-The Unicode properties that can be tested are limited to the general category
-properties such as Lu for an upper case letter or Nd for a decimal number, the
-Unicode script names such as Arabic or Han, and the derived properties Any and
-L&. Full lists are given in the
+The Unicode properties that can be tested are a subset of those that Perl
+supports. Currently they are limited to the general category properties such as
+Lu for an upper case letter or Nd for a decimal number, the Unicode script
+names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
+properties Any and LC (synonym L&). Full lists are given in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 and
 <a href="pcre2syntax.html"><b>pcre2syntax</b></a>
-documentation. Only the short names for properties are supported. For example,
-\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE2 does not support this.
+documentation. In general, only the short names for properties are supported.
+For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
+supported. Furthermore, in Perl, many properties may optionally be prefixed by
+"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
 </P>
 <br><b>
 WIDE CHARACTERS AND UTF MODES
@ -477,7 +478,7 @@ AUTHOR
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -486,9 +487,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 23 February 2020
+Last updated: 22 December 2021
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
--- a/doc/pcre2_compile.3
+++ b/doc/pcre2_compile.3
@ -1,4 +1,4 @@
-.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
+.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -80,8 +80,17 @@ Additional options may be set in the compile context via the
 .\"
 function.
 .P
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the \fIerrorcode\fP argument to the
+\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
+error was encountered is returned via the \fIerroroffset\fP argument.
+.P
+If there is no error, the value passed via \fIerrorcode\fP returns the message
+"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
+via \fIerroroffset\fP is zero.
 .P
 There is a complete description of the PCRE2 native API, with more detail on
 each option, in the
--- a/doc/pcre2_jit_stack_create.3
+++ b/doc/pcre2_jit_stack_create.3
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
 \fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
 which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
 A maximum stack size of 512KiB to 1MiB should be more than enough for any
-pattern. For more details, see the
+pattern. If the stack couldn't be allocated or the values passed were not
+reasonable, NULL will be returned. For more details, see the
 .\" HREF
 \fBpcre2jit\fP
 .\"
--- a/doc/pcre2_serialize_decode.3
+++ b/doc/pcre2_serialize_decode.3
@ -36,7 +36,7 @@ the following negative error codes:
  PCRE2_ERROR_BADDATA   \fInumber_of_codes\fP is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in \fIbytes\fP
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      \fIcodes\fP or \fIbytes\fP is NULL
 .sp
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
--- a/doc/pcre2_set_compile_extra_options.3
+++ b/doc/pcre2_set_compile_extra_options.3
@ -18,9 +18,9 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
 housed in a compile context. It completely replaces all the bits. The extra
 options are:
 .sp
-.\" JOIN
  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \eK in lookarounds
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{df800} to \ex{dfff}
+.\" JOIN
+  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{d800} to \ex{dfff}
                                         in UTF-8 and UTF-32 modes
 .\" JOIN
  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \eu, \eU, and
--- a/doc/pcre2_substitute.3
+++ b/doc/pcre2_substitute.3
@ -55,32 +55,42 @@ automatically added.
 The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
 zero-terminated strings. The options are:
 .sp
-  PCRE2_ANCHORED             Match only at the first position
-  PCRE2_ENDANCHORED          Pattern can match only at end of subject
-  PCRE2_NOTBOL               Subject is not the beginning of a line
-  PCRE2_NOTEOL               Subject is not the end of a line
-  PCRE2_NOTEMPTY             An empty string is not a valid match
+  PCRE2_ANCHORED                     Match only at the first position
+  PCRE2_ENDANCHORED                  Match only at end of subject
 .\" JOIN
-  PCRE2_NOTEMPTY_ATSTART     An empty string at the start of the
-                              subject is not a valid match
-  PCRE2_NO_JIT               Do not use JIT matching
+  PCRE2_NOTBOL                       Subject is not the beginning of a
+                                      line
+  PCRE2_NOTEOL                       Subject is not the end of a line
 .\" JOIN
-  PCRE2_NO_UTF_CHECK         Do not check the subject or replacement
-                              for UTF validity (only relevant if
-                              PCRE2_UTF was set at compile time)
-  PCRE2_SUBSTITUTE_EXTENDED  Do extended replacement processing
-  PCRE2_SUBSTITUTE_GLOBAL    Replace all occurrences in the subject
-  PCRE2_SUBSTITUTE_LITERAL   The replacement string is literal
-  PCRE2_SUBSTITUTE_MATCHED   Use pre-existing match data for 1st match
-  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  If overflow, compute needed length
+  PCRE2_NOTEMPTY                     An empty string is not a
+                                      valid match
+.\" JOIN
+  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of
+                                      the subject is not a valid match
+  PCRE2_NO_JIT                       Do not use JIT matching
+.\" JOIN
+  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in
+                                      the subject or replacement
+.\" JOIN
+                                      (only relevant if PCRE2_UTF was
+                                      set at compile time)
+  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
+.\" JOIN
+  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the
+                                      subject
+  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
+.\" JOIN
+  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for
+                                      first match
+  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
-  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  Treat unknown group as unset
-  PCRE2_SUBSTITUTE_UNSET_EMPTY  Simple unset insert = empty string
+  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
+  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
 .sp
 If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
 PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
 .P
-If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-zero; its
+If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
 contents must be the result of a call to \fBpcre2_match()\fP using the same
 pattern and subject.
 .P
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@ -1,4 +1,4 @@
-.TH PCRE2API 3 "30 August 2021" "PCRE2 10.38"
+.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@ -953,7 +953,7 @@ has its own memory control arrangements (see the
 documentation for more details). If the limit is reached, the negative error
 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
 is built; if it is not, the default is set very large and is essentially
-"unlimited".
+unlimited.
 .P
 A value for the heap limit may also be supplied by an item at the start of a
 pattern of the form
@ -964,18 +964,18 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is
 less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
 limit is set, less than the default.
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack for recording backtracking points. The more nested backtracking points
-there are (that is, the deeper the search tree), the more memory is needed.
-Heap memory is used only if the initial vector is too small. If the heap limit
-is set to a value less than 21 (in particular, zero) no heap memory will be
-used. In this case, only patterns that do not have a lot of nested backtracking
-can be successfully processed.
+The \fBpcre2_match()\fP function always needs some heap memory, so setting a
+value of zero guarantees a "heap limit exceeded" error. Details of how
+\fBpcre2_match()\fP uses the heap are given in the
+.\" HREF
+\fBpcre2perform\fP
+.\"
+documentation.
 .P
-Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
-when processing pattern recursions, lookarounds, or atomic groups, and only if
-this is not big enough is heap memory used. In this case, too, setting a value
-of zero disables the use of the heap.
+For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
 .sp
 .nf
 .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
@ -1019,10 +1019,10 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
 .fi
 .sp
 This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
-Each time a nested backtracking point is passed, a new memory "frame" is used
+Each time a nested backtracking point is passed, a new memory frame is used
 to remember the state of matching at that point. Thus, this parameter
 indirectly limits the amount of memory that is used in a match. However,
-because the size of each memory "frame" depends on the number of capturing
+because the size of each memory frame depends on the number of capturing
 parentheses, the actual memory limit varies from pattern to pattern. This limit
 was more useful in versions before 10.30, where function recursion was used for
 backtracking.
@ -1323,8 +1323,7 @@ If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
 NULL immediately. Otherwise, the variables to which these point are set to an
 error code and an offset (number of code units) within the pattern,
 respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
-error has occurred. The values are not defined when compilation is successful
-and \fBpcre2_compile()\fP returns a non-NULL value.
+error has occurred.
 .P
 There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
 if it finds an error in the pattern. There are also some negative error codes
@ -1343,14 +1342,17 @@ message"
 below)
 .\"
 should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
-for both positive and negative error codes in \fBpcre2.h\fP.
+for both positive and negative error codes in \fBpcre2.h\fP. When compilation
+is successful \fIerrorcode\fP is set to a value that returns the message "no
+error" if passed to \fBpcre2_get_error_message()\fP.
 .P
 The value returned in \fIerroroffset\fP is an indication of where in the
-pattern the error occurred. It is not necessarily the furthest point in the
-pattern that was read. For example, after the error "lookbehind assertion is
-not fixed length", the error offset points to the start of the failing
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
-first code unit of the failing character.
+pattern an error occurred. When there is no error, zero is returned. A non-zero
+value is not necessarily the furthest point in the pattern that was read. For
+example, after the error "lookbehind assertion is not fixed length", the error
+offset points to the start of the failing assertion. For an invalid UTF-8 or
+UTF-16 string, the offset is that of the first code unit of the failing
+character.
 .P
 Some errors are not detected until the whole pattern has been scanned; in these
 cases, the offset passed back is the length of the pattern. Note that the
@ -1794,7 +1796,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is
 undefined. It may cause your program to crash or loop.
 .P
 Note that this option can also be passed to \fBpcre2_match()\fP and
-\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject
+\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
 string.
 .P
 Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
@ -2015,8 +2017,8 @@ point. However, this applies only to characters whose code points are less than
 256. By default, higher-valued code points never match escapes such as \ew or
 \ed.
 .P
-When PCRE2 is built with Unicode support (the default), the Unicode properties
-of all characters can be tested with \ep and \eP, or, alternatively, the
+When PCRE2 is built with Unicode support (the default), certain Unicode
+character properties can be tested with \ep and \eP, or, alternatively, the
 PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and
 friends to use Unicode property support instead of the built-in tables.
 PCRE2_UCP also causes upper/lower casing operations on characters with code
@ -2279,7 +2281,7 @@ return zero. The third argument should point to a \fBsize_t\fP variable.
  PCRE2_INFO_LASTCODETYPE
 .sp
 Returns 1 if there is a rightmost literal code unit that must exist in any
-matched string, other than at its start. The third argument should  point to a
+matched string, other than at its start. The third argument should point to a
 \fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
 returned, the code unit value itself can be retrieved using
 PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
@ -2624,7 +2626,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in
 \fIstartoffset\fP. The length and offset are in code units, not characters.
 That is, they are in bytes for the 8-bit library, 16-bit code units for the
 16-bit library, and 32-bit code units for the 32-bit library, whether or not
-UTF processing is enabled.
+UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and
+\fIlength\fP is zero, the subject is assumed to be an empty string. If
+\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL.
 .P
 If \fIstartoffset\fP is greater than the length of the subject,
 \fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
@ -3158,11 +3162,11 @@ The backtracking match limit was reached.
 .sp
  PCRE2_ERROR_NOMEMORY
 .sp
-If a pattern contains many nested backtracking points, heap memory is used to
-remember them. This error is given when the memory allocation function (default
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+Heap memory is used to remember backtracking points. This error is given when
+the memory allocation function (default or custom) fails. Note that a different
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
 .sp
  PCRE2_ERROR_NULL
 .sp
@ -3413,12 +3417,16 @@ same number causes an error at compile time.
 .P
 This function optionally calls \fBpcre2_match()\fP and then makes a copy of the
 subject string in \fIoutputbuffer\fP, replacing parts that were matched with
-the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This
-can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
-option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
-replacement string(s). The default action is to perform just one replacement if
-the pattern matches, but there is an option that requests multiple replacements
-(see PCRE2_SUBSTITUTE_GLOBAL below).
+the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP, which
+can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
+special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the
+replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an
+error occurs if \fIreplacement\fP is NULL.
+.P
+There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
+the replacement string(s). The default action is to perform just one
+replacement if the pattern matches, but there is an option that requests
+multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
 .P
 If successful, \fBpcre2_substitute()\fP returns the number of substitutions
 that were carried out. This may be zero if no match was found, and is never
@ -3447,12 +3455,12 @@ block may or may not have been changed.
 As well as the usual options for \fBpcre2_match()\fP, a number of additional
 options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
 One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
-\fImatch_data\fP block must be provided, and it must have been used for an
-external call to \fBpcre2_match()\fP. The data in the \fImatch_data\fP block
-(return code, offset vector) is used for the first substitution instead of
-calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. This allows
-an application to check for a match before choosing to substitute, without
-having to repeat the match.
+\fImatch_data\fP block must be provided, and it must have already been used for
+an external call to \fBpcre2_match()\fP with the same pattern and subject
+arguments. The data in the \fImatch_data\fP block (return code, offset vector)
+is then used for the first substitution instead of calling \fBpcre2_match()\fP
+from within \fBpcre2_substitute()\fP. This allows an application to check for a
+match before choosing to substitute, without having to repeat the match.
 .P
 The contents of the externally supplied match data block are not changed when
 PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
@ -3584,7 +3592,7 @@ and force lower case. The escape sequences change the current state: \eU and
 terminating a \eQ quoted sequence) reverts to no case forcing. The sequences
 \eu and \el force the next character (if it is a letter) to upper or lower
 case, respectively, and then the state automatically reverts to no case
-forcing. Case forcing applies to all inserted  characters, including those from
+forcing. Case forcing applies to all inserted characters, including those from
 capture groups and letters within \eQ...\eE quoted sequences. If either
 PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
 properties are used for case forcing characters whose code points are greater
@ -3649,7 +3657,9 @@ needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
 default.
 .P
 PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
-\fImatch_data\fP argument is NULL.
+\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP
+arguments are NULL. For backward compatibility reasons an exception is made for
+the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0.
 .P
 PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
 replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
@ -3811,12 +3821,13 @@ other alternatives. Ultimately, when it runs out of matches,
 .P
 The function \fBpcre2_dfa_match()\fP is called to match a subject string
 against a compiled pattern, using a matching algorithm that scans the subject
-string just once (not counting lookaround assertions), and does not backtrack.
-This has different characteristics to the normal algorithm, and is not
-compatible with Perl. Some of the features of PCRE2 patterns are not supported.
-Nevertheless, there are times when this kind of matching can be useful. For a
-discussion of the two matching algorithms, and a list of features that
-\fBpcre2_dfa_match()\fP does not support, see the
+string just once (not counting lookaround assertions), and does not backtrack
+(except when processing lookaround assertions). This has different
+characteristics to the normal algorithm, and is not compatible with Perl. Some
+of the features of PCRE2 patterns are not supported. Nevertheless, there are
+times when this kind of matching can be useful. For a discussion of the two
+matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does
+not support, see the
 .\" HREF
 \fBpcre2matching\fP
 .\"
@ -3848,7 +3859,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
    wspace,         /* working space vector */
    20);            /* number of elements (NOT size in bytes) */
 .
-.SS "Option bits for \fBpcre_dfa_match()\fP"
+.SS "Option bits for \fBpcre2_dfa_match()\fP"
 .rs
 .sp
 The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
@ -4016,6 +4027,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2build.3
+++ b/doc/pcre2build.3
@ -1,4 +1,4 @@
-.TH PCRE2BUILD 3 "20 March 2020" "PCRE2 10.35"
+.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
 UTF support allows the libraries to process character code points up to
 0x10ffff in the strings that they handle. Unicode support also gives access to
 the Unicode properties of characters, using pattern escapes such as \eP, \ep,
-and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
-supported. Details are given in the
+and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
+script names, and some bi-directional properties are supported. Details are
+given in the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
 \fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
 counting is done differently).
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The \fBpcre2_match()\fP function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 .\" HREF
 \fBpcre2api\fP
 .\"
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
 for --with-match-limit. You can set a lower default limit by adding, for
 example,
 .sp
-  --with-match-limit_depth=10000
+  --with-match-limit-depth=10000
 .sp
 to the \fBconfigure\fP command. This value can be overridden at run time. This
 depth limit indirectly limits the amount of heap memory that is used, but
@ -624,7 +624,7 @@ give a warning.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -633,6 +633,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 20 March 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2compat.3
+++ b/doc/pcre2compat.3
@ -1,4 +1,4 @@
-.TH PCRE2COMPAT 3 "30 August 2021" "PCRE2 10.38"
+.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
@ -6,31 +6,38 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
 This document describes some of the differences in the ways that PCRE2 and Perl
 handle regular expressions. The differences described here are with respect to
-Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
+Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
 information may at times be out of date.
 .P
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
+1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
+behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
+next character unless it is the start of a newline sequence. This means that,
+if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
+(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
+EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
+indicator.
+.P
+2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
 have are given in the
 .\" HREF
 \fBpcre2unicode\fP
 .\"
 page.
 .P
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
+3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
 they do not mean what you might think. For example, (?!a){3} does not assert
 that the next three characters are not "a". It just asserts that the next
 character is not "a" three times (in principle; PCRE2 optimizes this to run the
 assertion just once). Perl allows some repeat quantifiers on other assertions,
-for example, \eb* (but not \eb{3}, though oddly it does allow ^{3}), but these
-do not seem to have any use. PCRE2 does not allow any kind of quantifier on
-non-lookaround assertions.
+for example, \eb*, but these do not seem to have any use. PCRE2 does not allow
+any kind of quantifier on non-lookaround assertions.
 .P
-3. Capture groups that occur inside negative lookaround assertions are counted,
+4. Capture groups that occur inside negative lookaround assertions are counted,
 but their entries in the offsets vector are set only when a negative assertion
 is a condition that has a matching branch (that is, the condition is false).
 Perl may set such capture groups in other circumstances.
 .P
-4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
+5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
 \eU, and \eN when followed by a character name. \eN on its own, matching a
 non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
 supported. The escapes that modify the case of following letters are
@ -40,12 +47,12 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
 PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
 interprets them.
 .P
-5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
+6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
 built with Unicode support (the default). The properties that can be tested
 with \ep and \eP are limited to the general category properties such as Lu and
-Nd, script names such as Greek or Han, and the derived properties Any and L&.
-Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
-is limited. See the
+Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
+derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
+(surrogate) property, but in PCRE2 its use is limited. See the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -53,14 +60,14 @@ documentation for details. The long synonyms for property names that Perl
 supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
 to prefix any of these properties with "Is".
 .P
-6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
+7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
 in between are treated as literals. However, this is slightly different from
 Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
-they cause variable interpolation (but of course PCRE2 does not have
-variables). Also, Perl does "double-quotish backslash interpolation" on any
-backslashes between \eQ and \eE which, its documentation says, "may lead to
-confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
-other character. Note the following examples:
+they cause variable interpolation (PCRE2 does not have variables). Also, Perl
+does "double-quotish backslash interpolation" on any backslashes between \eQ
+and \eE which, its documentation says, "may lead to confusing results". PCRE2
+treats a backslash between \eQ and \eE just like any other character. Note the
+following examples:
 .sp
    Pattern            PCRE2 matches     Perl matches
 .sp
@ -75,7 +82,7 @@ other character. Note the following examples:
 The \eQ...\eE sequence is recognized both inside and outside character classes
 by both PCRE2 and Perl.
 .P
-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
+8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
 constructions. However, PCRE2 does have a "callout" feature, which allows an
 external function to be called during pattern matching. See the
 .\" HREF
@ -83,11 +90,11 @@ external function to be called during pattern matching. See the
 .\"
 documentation for details.
 .P
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up
+9. Subroutine calls (whether recursive or not) were treated as atomic groups up
 to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
 into subroutine calls is now supported, as in Perl.
 .P
-9. In PCRE2, if any of the backtracking control verbs are used in a group that
+10. In PCRE2, if any of the backtracking control verbs are used in a group that
 is called as a subroutine (whether or not recursively), their effect is
 confined to that group; it does not extend to the surrounding pattern. This is
 not always the case in Perl. In particular, if (*THEN) is present in a group
@ -95,18 +102,18 @@ that is called as a subroutine, its action is limited to that group, even if
 the group does not contain any | characters. Note that such groups are
 processed as anchored at the point where they are tested.
 .P
-10. If a pattern contains more than one backtracking control verb, the first
+11. If a pattern contains more than one backtracking control verb, the first
 one that is backtracked onto acts. For example, in the pattern
 A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
 triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
 same as PCRE2, but there are cases where it differs.
 .P
-11. There are some differences that are concerned with the settings of captured
+12. There are some differences that are concerned with the settings of captured
 strings when part of a pattern is repeated. For example, matching "aba" against
 the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
 "b".
 .P
-12. PCRE2's handling of duplicate capture group numbers and names is not as
+13. PCRE2's handling of duplicate capture group numbers and names is not as
 general as Perl's. This is a consequence of the fact that PCRE2 works internally
 just with numbers, using an external table to translate between numbers and
 names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
@ -115,37 +122,38 @@ causes an error at compile time. If it were allowed, it would not be possible
 to distinguish which group matched, because both names map to capture group
 number 1. To avoid this confusing situation, an error is given at compile time.
 .P
-13. Perl used to recognize comments in some places that PCRE2 does not, for
+14. Perl used to recognize comments in some places that PCRE2 does not, for
 example, between the ( and ? at the start of a group. If the /x modifier is
 set, Perl allowed white space between ( and ? though the latest Perls give an
 error (for a while it was just deprecated). There may still be some cases where
 Perl behaves differently.
 .P
-14. Perl, when in warning mode, gives warnings for character classes such as
+15. Perl, when in warning mode, gives warnings for character classes such as
 [A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
 warning features, so it gives an error in these cases because they are almost
 certainly user mistakes.
 .P
-15. In PCRE2, the upper/lower case character properties Lu and Ll are not
+16. In PCRE2, the upper/lower case character properties Lu and Ll are not
 affected when case-independent matching is specified. For example, \ep{Lu}
 always matches an upper case letter. I think Perl has changed in this respect;
-in the release at the time of writing (5.32), \ep{Lu} and \ep{Ll} match all
+in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
 letters, regardless of case, when case independence is specified.
 .P
-16. From release 5.32.0, Perl locks out the use of \eK in lookaround
+17. From release 5.32.0, Perl locks out the use of \eK in lookaround
 assertions. From release 10.38 PCRE2 does the same by default. However, there
 is an option for re-enabling the previous behaviour. When this option is set,
 \eK is acted on when it occurs in positive assertions, but is ignored in
 negative assertions.
 .P
-17. PCRE2 provides some extensions to the Perl regular expression facilities.
+18. PCRE2 provides some extensions to the Perl regular expression facilities.
 Perl 5.10 included new features that were not in earlier versions of Perl, some
 of which (such as named parentheses) were in PCRE2 for some time before. This
-list is with respect to Perl 5.32:
+list is with respect to Perl 5.34:
 .sp
 (a) Although lookbehind assertions in PCRE2 must match fixed length strings,
 each alternative toplevel branch of a lookbehind assertion can match a
-different length of string. Perl requires them all to have the same length.
+different length of string. Perl used to require them all to have the same
+length, but the latest version has some variable length support.
 .sp
 (b) From PCRE2 10.23, backreferences to groups of fixed length are supported
 in lookbehinds, provided that there is no possibility of referencing a
@ -186,11 +194,11 @@ the pattern.
 extension to the lookaround facilities. The default, Perl-compatible
 lookarounds are atomic.
 .P
-18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
+19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
 modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
 rules. This separation cannot be represented with PCRE2_UCP.
 .P
-19. Perl has different limits than PCRE2. See the
+20. Perl has different limits than PCRE2. See the
 .\" HREF
 \fBpcre2limit\fP
 .\"
@ -214,6 +222,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2021
+Last updated: 08 December 2021
 Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "31 August 2021" "PCRE2 10.38"
+.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
 .SH NAME
 pcre2grep - a grep with Perl-compatible regular expressions.
 .SH SYNOPSIS
@ -43,13 +43,15 @@ For example:
 .sp
  pcre2grep some-pattern file1 - file3
 .sp
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how \fBpcre2grep\fP behaves. In
-particular, the \fB-M\fP option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-\fB-N\fP (\fB--newline\fP) option.
+However, there are options that can change how \fBpcre2grep\fP behaves. For
+example, the \fB-M\fP option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the \fB-N\fP
+(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
+not file names are shown, and the \fB-Z\fP option changes the file name
+terminator to a zero byte.
 .P
 The amount of memory used for buffering files that are being scanned is
 controlled by parameters that can be set by the \fB--buffer-size\fP and
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of \fInumber\fP
-is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
+context lines (the \fB-Z\fP option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
+\fB-A\fP is ignored.
 .TP
 \fB-a\fP, \fB--text\fP
 Treat binary files as text. This is equivalent to
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
 lines are output if the previous match or the start of the file is within
 \fInumber\fP lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of \fInumber\fP is expected to be relatively small. When
+instead of a colon for the context lines (the \fB-Z\fP option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of \fInumber\fP is expected to be relatively small. When
 \fB-c\fP is used, \fB-B\fP is ignored.
 .TP
 \fB--binary-files=\fP\fIword\fP
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
 .TP
 \fB-H\fP, \fB--with-filename\fP
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the \fB-M\fP option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the \fB-M\fP option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
 .TP
 \fB-h\fP, \fB--no-filename\fP
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
+Suppress the output of file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The \fB-Z\fP option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
 .TP
 \fB--heap-limit\fP=\fInumber\fP
 See \fB--match-limit\fP below.
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
 \fB-L\fP, \fB--files-without-match\fP
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous \fB-H\fP,
-\fB-h\fP, or \fB-l\fP options.
+output once, on a separate line by default, but if the \fB-Z\fP option is set,
+the names are separated by zero bytes instead of newlines. This option overrides any
+previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
 .TP
 \fB-l\fP, \fB--files-with-matches\fP
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the \fB-c\fP (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-\fB-c\fP is a way of suppressing the listing of files with no matches that
+a separate line, but if the \fB-Z\fP option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the \fB-c\fP (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with \fB-c\fP is a way of suppressing the listing of files with no matches that
 occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
 \fB-h\fP, or \fB-L\fP options.
 .TP
@ -516,10 +525,7 @@ counter that is incremented each time around its main processing loop. If the
 value set by \fB--match-limit\fP is reached, an error occurs.
 .sp
 The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
+1024 bytes), the maximum amount of heap memory that may be used for matching.
 .sp
 The \fB--depth-limit\fP option limits the depth of nested backtracking points,
 which indirectly limits the amount of memory that is used. The amount of memory
@ -732,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
 pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the \fB--include\fP or \fB--exclude\fP options.
+.TP
+\fB-Z\fP, \fB--null\fP
+Terminate file names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
 .
 .
 .SH "ENVIRONMENT VARIABLES"
@ -960,6 +972,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 31 August 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 30 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt
--- a/doc/pcre2jit.3
+++ b/doc/pcre2jit.3
@ -1,4 +1,4 @@
-.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34"
+.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
@ -251,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
 starts another match, that match must use a different JIT stack to the one used
 for currently suspended match(es).
 .P
-In a multithread application, if you do not
-specify a JIT stack, or if you assign or pass back NULL from a callback, that
-is thread-safe, because each thread has its own machine stack. However, if you
-assign or pass back a non-NULL JIT stack, this must be a different stack for
-each thread so that the application is thread-safe.
+In a multithread application, if you do not specify a JIT stack, or if you
+assign or pass back NULL from a callback, that is thread-safe, because each
+thread has its own machine stack. However, if you assign or pass back a
+non-NULL JIT stack, this must be a different stack for each thread so that the
+application is thread-safe.
 .P
 Strictly speaking, even more is allowed. You can assign the same non-NULL stack
 to a match context that is used by any number of patterns, as long as they are
@ -355,8 +355,8 @@ out this complicated API.
 .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
 .fi
 .P
-The JIT executable allocator does not free all memory when it is possible.
-It expects new allocations, and keeps some free memory around to improve
+The JIT executable allocator does not free all memory when it is possible. It
+expects new allocations, and keeps some free memory around to improve
 allocation speed. However, in low memory conditions, it might be better to free
 all possible memory. You can cause this to happen by calling
 pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
@ -416,10 +416,10 @@ that was not compiled.
 .P
 When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
 number of other sanity checks are performed on the arguments. For example, if
-the subject pointer is NULL, an immediate error is given. Also, unless
-PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
-interests of speed, these checks do not happen on the JIT fast path, and if
-invalid data is passed, the result is undefined.
+the subject pointer is NULL but the length is non-zero, an immediate error is
+given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
+for validity. In the interests of speed, these checks do not happen on the JIT
+fast path, and if invalid data is passed, the result is undefined.
 .P
 Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
 speedups of more than 10%.
@ -445,6 +445,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 23 May 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 30 November 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2limits.3
+++ b/doc/pcre2limits.3
@ -1,4 +1,4 @@
-.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "SIZE AND OTHER LIMITATIONS"
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
 .P
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
+.P
+The maximum amount of heap memory used for matching is controlled by the heap
+limit, which can be set in a pattern or in a match context. The default is a
+very large number, effectively unlimited.
 .
 .
 .SH AUTHOR
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -67,6 +71,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 02 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 26 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "3o0 August 2021" "PCRE2 10.38"
+.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -509,7 +509,6 @@ for themselves. For example, outside a character class:
 .\" JOIN
  \e377   might be a backreference, otherwise
            the value 255 (decimal)
-.\" JOIN
  \e81    is always a backreference
 .sp
 Note that octal values of 100 or greater that are specified using this syntax
@ -773,200 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
 sequences are of course limited to testing characters whose code points are
 less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
 greater than 0x10ffff (the Unicode limit) may be encountered. These are all
-treated as being in the Unknown script and with an unassigned type. The extra
-escape sequences are:
+treated as being in the Unknown script and with an unassigned type.
+.P
+Matching characters by Unicode property is not fast, because PCRE2 has to do a
+multistage table lookup in order to find a character's property. That is why
+the traditional escape sequences such as \ed and \ew do not use Unicode
+properties in PCRE2 by default, though you can make them do so by setting the
+PCRE2_UCP option or by starting the pattern with (*UCP).
+.P
+The extra escape sequences that provide property support are:
 .sp
  \ep{\fIxx\fP}   a character with the \fIxx\fP property
  \eP{\fIxx\fP}   a character without the \fIxx\fP property
  \eX       a Unicode extended grapheme cluster
 .sp
-The property names represented by \fIxx\fP above are case-sensitive. There is
-support for Unicode script names, Unicode general category properties, "Any",
-which matches any character (including newline), and some special PCRE2
-properties (described in the
+The property names represented by \fIxx\fP above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
 .\" HTML <a href="#extraprops">
 .\" </a>
-next section).
+below).
 .\"
-Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
-Note that \eP{Any} does not match any characters, so always causes a match
-failure.
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \eP{Any} does not match any characters, so always causes a
+match failure.
+.
+.
+.
+.SS "Script properties for \ep and \eP"
+.rs
+.sp
+There are three different syntax forms for matching a script. Each Unicode
+character has a basic script and, optionally, a list of other scripts ("Script
+Extensions") with which it is commonly used. Using the Adlam script as an
+example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
+\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
+extensions list. The full names "script" and "script extensions" for the
+property types are recognized, and an equals sign is an alternative to the
+colon. If a script name is given without a property type, for example,
+\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
+interpretation at release 5.26 and PCRE2 changed at release 10.40.
 .P
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-.sp
-  \ep{Greek}
-  \eP{Han}
-.sp
 Unassigned characters (and in non-UTF 32-bit mode, characters with code points
 greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
 part of an identified script are lumped together as "Common". The current list
-of scripts is:
-.P
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cypro_Minoan,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Old_Uyghur,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangsa,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Toto,
-Ugaritic,
-Unknown,
-Vai,
-Vithkuqi,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
-.P
+of recognized script names and their 4-character abbreviations can be obtained
+by running this command:
+.sp
+  pcre2test -LS
+.sp
+.
+.
+.
+.SS "The general category property for \ep and \eP"
+.rs
+.sp
 Each character has exactly one Unicode general category property, specified by
 a two-letter abbreviation. For compatibility with Perl, negation can be
 specified by including a circumflex between the opening brace and the property
@ -1026,9 +889,9 @@ The following general category property codes are supported:
  Zp    Paragraph separator
  Zs    Space separator
 .sp
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
+The special property LC, which has the synonym L&, is also supported: it
+matches a character that has the Lu, Ll, or Lt property, in other words, a
+letter that is not classified as a modifier or "other".
 .P
 The Cs (Surrogate) property applies only to characters whose code points are in
 the range U+D800 to U+DFFF. These characters are no different to any other
@ -1052,12 +915,53 @@ Unicode table.
 Specifying caseless matching does not affect these escape sequences. For
 example, \ep{Lu} always matches only upper case letters. This is different from
 the behaviour of current versions of Perl.
-.P
-Matching characters by Unicode property is not fast, because PCRE2 has to do a
-multistage table lookup in order to find a character's property. That is why
-the traditional escape sequences such as \ed and \ew do not use Unicode
-properties in PCRE2 by default, though you can make them do so by setting the
-PCRE2_UCP option or by starting the pattern with (*UCP).
+.
+.
+.SS "Binary (yes/no) properties for \ep and \eP"
+.rs
+.sp
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\ep and \eP, along with their abbreviations, by running this command:
+.sp
+  pcre2test -LP
+.sp
+.
+.
+.SS "The Bidi_Class property for \ep and \eP"
+.rs
+.sp
+  \ep{Bidi_Class:<class>}   matches a character with the given class
+  \ep{BC:<class>}           matches a character with the given class
+.sp
+The recognized classes are:
+.sp
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
+.sp
+An equals sign may be used instead of a colon. The class names are
+case-insensitive; only the short names listed above are recognized.
 .
 .
 .SS Extended grapheme clusters
@ -1336,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
 .sp
 Outside a character class, a dot in the pattern matches any one character in
 the subject string except (by default) a character that signifies the end of a
-line.
+line. One or more characters may be specified as line terminators (see
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above).
 .P
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
+Dot never matches a single line-ending character. When the two-character
+sequence CRLF is the only line ending, dot does not match CR if it is
+immediately followed by LF, but otherwise it matches all characters (including
+isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
+of CR or LF match dot. When all Unicode line endings are being recognized, dot
+does not match CR or LF or any of the other line ending characters.
 .P
 The behaviour of dot with regard to newlines can be changed. If the
 PCRE2_DOTALL option is set, a dot matches any one character, without exception.
@ -2186,10 +2095,10 @@ be easier to remember:
 .sp
  (*atomic:\ed+)foo
 .sp
-This kind of parenthesized group "locks up" the  part of the pattern it
-contains once it has matched, and a failure further into the pattern is
-prevented from backtracking into it. Backtracking past it to previous items,
-however, works as normal.
+This kind of parenthesized group "locks up" the part of the pattern it contains
+once it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
 .P
 An alternative description is that a group of this type matches exactly the
 string of characters that an identical standalone pattern would match, if
@ -3905,6 +3814,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 12 January 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2perform.3
+++ b/doc/pcre2perform.3
@ -1,4 +1,4 @@
-.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 PERFORMANCE"
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code. 
+.P
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+.P
+Until release 10.41, an initial 20KiB frames vector was allocated on the system 
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to \fBpcre2_match()\fP. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+.P
+The size of the initial block is the larger of 20KiB or ten times the pattern's 
+frame size, unless the heap limit is less than this, in which case the heap 
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is 
+checked only when a new block is to be allocated. Reducing the heap limit 
+between calls to \fBpcre2_match()\fP with the same match data block does not 
+affect the saved block.
 .P
 In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
 function calls, but only for processing atomic groups, lookaround assertions,
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -239,6 +255,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 03 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2serialize.3
+++ b/doc/pcre2serialize.3
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
 .nf
 .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
-.B "  int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
+.B "  int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
 .B "  pcre2_general_context *\fIgcontext\fP);"
 .sp
-.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
-.B "  int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
+.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
+.B "  int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
 .B "  PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
 .sp
 .B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
 .sp
  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 .sp
@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
 \fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
 stream is no longer needed and can be discarded.
 .sp
-  int32_t number_of_codes;
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = <serialized data>;
  int32_t number_of_codes =
--- a/doc/pcre2syntax.3
+++ b/doc/pcre2syntax.3
@ -1,4 +1,4 @@
-.TH PCRE2SYNTAX 3 "30 August 2021" "PCRE2 10.38"
+.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range
 128-255. If the PCRE2_UCP option is set, the behaviour of these escape
 sequences is changed to use Unicode properties and they match many more
 characters.
+.P
+Property descriptions in \ep and \eP are matched caselessly; hyphens,
+underscores, and white space are ignored, in accordance with Unicode's "loose
+matching" rules.
 .
 .
 .SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP"
@ -120,6 +124,7 @@ characters.
  Lo         Other letter
  Lt         Title case letter
  Lu         Upper case letter
+  Lc         Ll, Lu, or Lt
  L&         Ll, Lu, or Lt
 .sp
  M          Mark
@ -167,170 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set
 at release 5.18.
 .
 .
-.SH "SCRIPT NAMES FOR \ep AND \eP"
+.SH "BINARY PROPERTIES FOR \ep AND \eP"
 .rs
 .sp
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cypro_Minoan,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Old_Uyghur,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangsa,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Toto,
-Ugaritic,
-Vai,
-Vithkuqi,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\ep and \eP, along with their abbreviations, by running this command:
+.sp
+  pcre2test -LP
+.
+.
+.
+.SH "SCRIPT MATCHING WITH \ep AND \eP"
+.rs
+.sp
+Many script names and their 4-letter abbreviations are recognized in
+\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
+course). You can obtain a list of these scripts by running this command:
+.sp
+  pcre2test -LS
+.
+.
+.
+.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP"
+.rs
+.sp
+  \ep{Bidi_Class:<class>}   matches a character with the given class
+  \ep{BC:<class>}           matches a character with the given class
+.sp
+The recognized classes are:
+.sp
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
 .
 .
 .SH "CHARACTER CLASSES"
@ -684,6 +578,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 12 January 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "30 August 2021" "PCRE 10.38"
+.TH PCRE2TEST 1 "27 July 2022" "PCRE 10.41"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@ -47,7 +47,7 @@ format before being passed to the library functions. Results are converted back
 to 8-bit code units for output.
 .P
 In the rest of this document, the names of library functions and structures
-are given in generic form, for example, \fBpcre_compile()\fP. The actual
+are given in generic form, for example, \fBpcre2_compile()\fP. The actual
 names used in the libraries have a suffix _8, _16, or _32, as appropriate.
 .
 .
@ -211,7 +211,17 @@ available, and the use of JIT for matching is verified.
 \fB-LM\fP
 List modifiers: write a list of available pattern and subject modifiers to the
 standard output, then exit with zero exit code. All other options are ignored.
-If both -C and -LM are present, whichever is first is recognized.
+If both -C and any -Lx options are present, whichever is first is recognized.
+.TP 10
+\fB-LP\fP
+List properties: write a list of recognized Unicode properties to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
+.TP 10
+\fB-LS\fP
+List scripts: write a list of recognized Unicode script names to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
 .TP 10
 \fB-pattern\fP \fImodifier-list\fP
 Behave as if each pattern line contains the given modifiers.
@ -1196,7 +1206,8 @@ pattern, but can be overridden by modifiers on the subject.
      copy=<number or name>      copy captured substring
      depth_limit=<n>            set a depth limit
      dfa                        use \fBpcre2_dfa_match()\fP
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
      get=<number or name>       extract captured substring
      getall                     extract all captured substrings
  /g  global                     global matching
@ -1206,6 +1217,8 @@ pattern, but can be overridden by modifiers on the subject.
      match_limit=<n>            set a match limit
      memory                     show heap memory usage
      null_context               match with a NULL context
+      null_replacement           substitute with NULL replacement
+      null_subject               match with NULL subject
      offset=<n>                 set starting offset
      offset_limit=<n>           set offset limit
      ovector=<n>                set size of output vector
@ -1516,7 +1529,7 @@ value that was set on the pattern.
 .sp
 The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
 the appropriate limits in the match context. These values are ignored when the
-\fBfind_limits\fP modifier is specified.
+\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
 .
 .
 .SS "Finding minimum limits"
@ -1526,8 +1539,12 @@ If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
 calls the relevant matching function several times, setting different values in
 the match context via \fBpcre2_set_heap_limit()\fP,
 \fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 .P
 When using this modifier, the pattern should not contain any limit settings
 such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
@ -1551,9 +1568,7 @@ and non-recursive, to the internal matching function, thus controlling the
 overall amount of computing resource that is used.
 .P
 For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 .
 .
 .SS "Showing MARK names"
@ -1572,12 +1587,10 @@ is added to the non-match message.
 .sp
 The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the \fBmemory\fP modifier never has any effect. For this modifier to work, the
+\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
+is used only when a match requires more internal workspace than the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 \fBnull_context\fP modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 .
@ -1629,7 +1642,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of
 passing the replacement string as zero-terminated.
 .
 .
-.SS "Passing a NULL context"
+.SS "Passing a NULL context, subject, or replacement"
 .rs
 .sp
 Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
@ -1637,7 +1650,12 @@ Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
 If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
+\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
+modifiers.
+.P
+Similarly, for testing purposes, if the \fBnull_subject\fP or
+\fBnull_replacement\fP modifier is set, the subject or replacement string
+pointers are passed as NULL, respectively, to the relevant functions.
 .
 .
 .SH "THE ALTERNATIVE MATCHING FUNCTION"
@ -2103,6 +2121,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2test.txt
+++ b/doc/pcre2test.txt
@ -44,7 +44,7 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
       output.

       In the rest of this document, the names of library functions and struc-
-       tures  are  given in generic form, for example, pcre_compile(). The ac-
+       tures  are given in generic form, for example, pcre2_compile(). The ac-
       tual names used in the libraries have a suffix _8, _16, or _32, as  ap-
       propriate.

@ -197,7 +197,17 @@ COMMAND LINE OPTIONS

       -LM       List modifiers: write a list of available pattern and subject
                 modifiers to the standard output, then exit  with  zero  exit
-                 code.  All other options are ignored.  If both -C and -LM are
+                 code.  All other options are ignored.  If both -C and any -Lx
+                 options are present, whichever is first is recognized.
+
+       -LP       List properties: write a list of recognized  Unicode  proper-
+                 ties  to  the standard output, then exit with zero exit code.
+                 All other options are ignored. If both -C and any -Lx options
+                 are present, whichever is first is recognized.
+
+       -LS       List scripts: write a list of recognized Unicode script names
+                 to the standard output, then exit with zero  exit  code.  All
+                 other options are ignored. If both -C and any -Lx options are
                 present, whichever is first is recognized.

       -pattern modifier-list
@ -1101,7 +1111,8 @@ SUBJECT MODIFIERS
             copy=<number or name>      copy captured substring
             depth_limit=<n>            set a depth limit
             dfa                        use pcre2_dfa_match()
-             find_limits                find match and depth limits
+             find_limits                find heap, match and depth limits
+             find_limits_noheap         find match and depth limits
             get=<number or name>       extract captured substring
             getall                     extract all captured substrings
         /g  global                     global matching
@ -1111,6 +1122,8 @@ SUBJECT MODIFIERS
             match_limit=<n>            set a match limit
             memory                     show heap memory usage
             null_context               match with a NULL context
+             null_replacement           substitute with NULL replacement
+             null_subject               match with NULL subject
             offset=<n>                 set starting offset
             offset_limit=<n>           set offset limit
             ovector=<n>                set size of output vector
@ -1399,7 +1412,7 @@ SUBJECT MODIFIERS

       The heap_limit, match_limit, and depth_limit modifiers set  the  appro-
       priate  limits  in the match context. These values are ignored when the
-       find_limits modifier is specified.
+       find_limits or find_limits_noheap modifier is specified.

   Finding minimum limits

@ -1407,8 +1420,12 @@ SUBJECT MODIFIERS
       calls  the  relevant matching function several times, setting different
       values   in   the    match    context    via    pcre2_set_heap_limit(),
       pcre2_set_match_limit(),  or pcre2_set_depth_limit() until it finds the
-       minimum values for each parameter that allows  the  match  to  complete
-       without error. If JIT is being used, only the match limit is relevant.
+       smallest value for each parameter that allows  the  match  to  complete
+       without a "limit exceeded" error. The match itself may succeed or fail.
+       An alternative modifier, find_limits_noheap, omits the heap limit. This
+       is  used  in  the standard tests, because the minimum heap limit varies
+       between systems. If JIT is being used, only the match  limit  is  rele-
+       vant, and the other two are automatically omitted.

       When using this modifier, the pattern should not contain any limit set-
       tings such as (*LIMIT_MATCH=...)  within  it.  If  such  a  setting  is
@ -1434,9 +1451,7 @@ SUBJECT MODIFIERS

       For  both  kinds  of  matching,  the  heap_limit  number,  which  is in
       kibibytes (units of 1024 bytes), limits the amount of heap memory  used
-       for matching. A value of zero disables the use of any heap memory; many
-       simple pattern matches can be done without using the heap, so  zero  is
-       not an unreasonable setting.
+       for matching.

   Showing MARK names

@ -1451,13 +1466,11 @@ SUBJECT MODIFIERS

       The  memory modifier causes pcre2test to log the sizes of all heap mem-
       ory  allocation  and  freeing  calls  that  occur  during  a  call   to
-       pcre2_match()  or  pcre2_dfa_match(). These occur only when a match re-
-       quires a bigger vector than the default  for  remembering  backtracking
-       points  (pcre2_match())  or for internal workspace (pcre2_dfa_match()).
-       In many cases there will be no heap memory used and therefore no  addi-
-       tional output. No heap memory is allocated during matching with JIT, so
-       in that case the memory modifier never has any effect. For  this  modi-
-       fier  to  work,  the  null_context modifier must not be set on both the
+       pcre2_match()  or pcre2_dfa_match(). In the latter case, heap memory is
+       used only when a match requires more internal workspace  than  the  de-
+       fault  allocation  on the stack, so in many cases there will be no out-
+       put. No heap memory is allocated during matching  with  JIT.  For  this
+       modifier to work, the null_context modifier must not be set on both the
       pattern and the subject, though it can be set on one or the other.

   Setting a starting offset
@ -1499,48 +1512,53 @@ SUBJECT MODIFIERS
       When testing pcre2_substitute(), this modifier also has the  effect  of
       passing the replacement string as zero-terminated.

-   Passing a NULL context
+   Passing a NULL context, subject, or replacement

       Normally,   pcre2test   passes   a   context  block  to  pcre2_match(),
       pcre2_dfa_match(), pcre2_jit_match()  or  pcre2_substitute().   If  the
       null_context  modifier  is  set,  however,  NULL is passed. This is for
       testing that the matching and substitution functions  behave  correctly
       in  this  case  (they use default values). This modifier cannot be used
-       with the find_limits or substitute_callout modifiers.
+       with the find_limits, find_limits_noheap, or  substitute_callout  modi-
+       fiers.
+
+       Similarly,  for  testing purposes, if the null_subject or null_replace-
+       ment modifier is set, the subject or replacement  string  pointers  are
+       passed as NULL, respectively, to the relevant functions.


 THE ALTERNATIVE MATCHING FUNCTION

-       By default,  pcre2test  uses  the  standard  PCRE2  matching  function,
+       By  default,  pcre2test  uses  the  standard  PCRE2  matching function,
       pcre2_match() to match each subject line. PCRE2 also supports an alter-
-       native matching function, pcre2_dfa_match(), which operates in  a  dif-
-       ferent  way, and has some restrictions. The differences between the two
+       native  matching  function, pcre2_dfa_match(), which operates in a dif-
+       ferent way, and has some restrictions. The differences between the  two
       functions are described in the pcre2matching documentation.

-       If the dfa modifier is set, the alternative matching function is  used.
-       This  function  finds all possible matches at a given point in the sub-
-       ject. If, however, the dfa_shortest modifier is set,  processing  stops
-       after  the  first  match is found. This is always the shortest possible
+       If  the dfa modifier is set, the alternative matching function is used.
+       This function finds all possible matches at a given point in  the  sub-
+       ject.  If,  however, the dfa_shortest modifier is set, processing stops
+       after the first match is found. This is always  the  shortest  possible
       match.


 DEFAULT OUTPUT FROM pcre2test

-       This section describes the output when the  normal  matching  function,
+       This  section  describes  the output when the normal matching function,
       pcre2_match(), is being used.

-       When  a  match  succeeds,  pcre2test  outputs the list of captured sub-
-       strings, starting with number 0 for the string that matched  the  whole
+       When a match succeeds, pcre2test outputs  the  list  of  captured  sub-
+       strings,  starting  with number 0 for the string that matched the whole
       pattern.  Otherwise, it outputs "No match" when the return is PCRE2_ER-
-       ROR_NOMATCH, or "Partial match:" followed  by  the  partially  matching
-       substring  when  the  return is PCRE2_ERROR_PARTIAL. (Note that this is
-       the entire substring that was inspected during the  partial  match;  it
-       may  include  characters  before the actual match start if a lookbehind
+       ROR_NOMATCH,  or  "Partial  match:"  followed by the partially matching
+       substring when the return is PCRE2_ERROR_PARTIAL. (Note  that  this  is
+       the  entire  substring  that was inspected during the partial match; it
+       may include characters before the actual match start  if  a  lookbehind
       assertion, \K, \b, or \B was involved.)

       For any other return, pcre2test outputs the PCRE2 negative error number
-       and  a  short  descriptive  phrase. If the error is a failed UTF string
-       check, the code unit offset of the start of the  failing  character  is
+       and a short descriptive phrase. If the error is  a  failed  UTF  string
+       check,  the  code  unit offset of the start of the failing character is
       also output. Here is an example of an interactive pcre2test run.

         $ pcre2test
@ -1556,8 +1574,8 @@ DEFAULT OUTPUT FROM pcre2test
       Unset capturing substrings that are not followed by one that is set are
       not shown by pcre2test unless the allcaptures modifier is specified. In
       the following example, there are two capturing substrings, but when the
-       first data line is matched, the second, unset substring is  not  shown.
-       An  "internal" unset substring is shown as "<unset>", as for the second
+       first  data  line is matched, the second, unset substring is not shown.
+       An "internal" unset substring is shown as "<unset>", as for the  second
       data line.

           re> /(a)|(b)/
@ -1569,11 +1587,11 @@ DEFAULT OUTPUT FROM pcre2test
          1: <unset>
          2: b

-       If the strings contain any non-printing characters, they are output  as
-       \xhh  escapes  if  the  value is less than 256 and UTF mode is not set.
+       If  the strings contain any non-printing characters, they are output as
+       \xhh escapes if the value is less than 256 and UTF  mode  is  not  set.
       Otherwise they are output as \x{hh...} escapes. See below for the defi-
-       nition  of  non-printing  characters. If the aftertext modifier is set,
-       the output for substring 0 is followed by the the rest of  the  subject
+       nition of non-printing characters. If the aftertext  modifier  is  set,
+       the  output  for  substring  0  is  followed by the rest of the subject
       string, identified by "0+" like this:

           re> /cat/aftertext
@ -1593,8 +1611,8 @@ DEFAULT OUTPUT FROM pcre2test
          0: ipp
          1: pp

-       "No match" is output only if the first match attempt fails. Here is  an
-       example  of  a  failure  message (the offset 4 that is specified by the
+       "No  match" is output only if the first match attempt fails. Here is an
+       example of a failure message (the offset 4 that  is  specified  by  the
       offset modifier is past the end of the subject string):

           re> /xyz/
@ -1602,7 +1620,7 @@ DEFAULT OUTPUT FROM pcre2test
         Error -24 (bad offset value)

       Note that whereas patterns can be continued over several lines (a plain
-       ">"  prompt  is used for continuations), subject lines may not. However
+       ">" prompt is used for continuations), subject lines may  not.  However
       newlines can be included in a subject by means of the \n escape (or \r,
       \r\n, etc., depending on the newline sequence setting).

@ -1610,7 +1628,7 @@ DEFAULT OUTPUT FROM pcre2test
 OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION

       When the alternative matching function, pcre2_dfa_match(), is used, the
-       output consists of a list of all the matches that start  at  the  first
+       output  consists  of  a list of all the matches that start at the first
       point in the subject where there is at least one match. For example:

           re> /(tang|tangerine|tan)/
@ -1619,11 +1637,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
          1: tang
          2: tan

-       Using  the normal matching function on this data finds only "tang". The
-       longest matching string is always given first (and numbered zero).  Af-
-       ter  a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
+       Using the normal matching function on this data finds only "tang".  The
+       longest  matching string is always given first (and numbered zero). Af-
+       ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:",  fol-
       lowed by the partially matching substring. Note that this is the entire
-       substring  that  was inspected during the partial match; it may include
+       substring that was inspected during the partial match; it  may  include
       characters before the actual match start if a lookbehind assertion, \b,
       or \B was involved. (\K is not supported for DFA matching.)

@ -1639,16 +1657,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
          1: tan
          0: tan

-       The alternative matching function does not support  substring  capture,
-       so  the  modifiers  that are concerned with captured substrings are not
+       The  alternative  matching function does not support substring capture,
+       so the modifiers that are concerned with captured  substrings  are  not
       relevant.


 RESTARTING AFTER A PARTIAL MATCH

-       When the alternative matching function has given  the  PCRE2_ERROR_PAR-
+       When  the  alternative matching function has given the PCRE2_ERROR_PAR-
       TIAL return, indicating that the subject partially matched the pattern,
-       you can restart the match with additional subject data by means of  the
+       you  can restart the match with additional subject data by means of the
       dfa_restart modifier. For example:

           re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@ -1657,37 +1675,37 @@ RESTARTING AFTER A PARTIAL MATCH
         data> n05\=dfa,dfa_restart
          0: n05

-       For  further  information  about partial matching, see the pcre2partial
+       For further information about partial matching,  see  the  pcre2partial
       documentation.


 CALLOUTS

       If the pattern contains any callout requests, pcre2test's callout func-
-       tion  is  called during matching unless callout_none is specified. This
+       tion is called during matching unless callout_none is  specified.  This
       works with both matching functions, and with JIT, though there are some
-       differences  in behaviour. The output for callouts with numerical argu-
+       differences in behaviour. The output for callouts with numerical  argu-
       ments and those with string arguments is slightly different.

   Callouts with numerical arguments

       By default, the callout function displays the callout number, the start
-       and  current positions in the subject text at the callout time, and the
+       and current positions in the subject text at the callout time, and  the
       next pattern item to be tested. For example:

         --->pqrabcdef
           0    ^  ^     \d

-       This output indicates that callout number 0 occurred for  a  match  at-
-       tempt  starting at the fourth character of the subject string, when the
-       pointer was at the seventh character, and when the  next  pattern  item
-       was  \d.  Just  one circumflex is output if the start and current posi-
+       This  output  indicates  that callout number 0 occurred for a match at-
+       tempt starting at the fourth character of the subject string, when  the
+       pointer  was  at  the seventh character, and when the next pattern item
+       was \d. Just one circumflex is output if the start  and  current  posi-
       tions are the same, or if the current position precedes the start posi-
       tion, which can happen if the callout is in a lookbehind assertion.

       Callouts numbered 255 are assumed to be automatic callouts, inserted as
       a result of the auto_callout pattern modifier. In this case, instead of
-       showing  the  callout  number, the offset in the pattern, preceded by a
+       showing the callout number, the offset in the pattern,  preceded  by  a
       plus, is output. For example:

           re> /\d?[A-E]\*/auto_callout
@ -1714,17 +1732,17 @@ CALLOUTS
         +12 ^  ^
          0: abc

-       The mark changes between matching "a" and "b", but stays the  same  for
-       the  rest  of  the match, so nothing more is output. If, as a result of
-       backtracking, the mark reverts to being unset, the  text  "<unset>"  is
+       The  mark  changes between matching "a" and "b", but stays the same for
+       the rest of the match, so nothing more is output. If, as  a  result  of
+       backtracking,  the  mark  reverts to being unset, the text "<unset>" is
       output.

   Callouts with string arguments

       The output for a callout with a string argument is similar, except that
-       instead of outputting a callout number before the position  indicators,
-       the  callout string and its offset in the pattern string are output be-
-       fore the reflection of the subject string, and the  subject  string  is
+       instead  of outputting a callout number before the position indicators,
+       the callout string and its offset in the pattern string are output  be-
+       fore  the  reflection  of the subject string, and the subject string is
       reflected for each callout. For example:

           re> /^ab(?C'first')cd(?C"second")ef/
@ -1740,26 +1758,26 @@ CALLOUTS

   Callout modifiers

-       The  callout  function in pcre2test returns zero (carry on matching) by
-       default, but you can use a callout_fail modifier in a subject  line  to
+       The callout function in pcre2test returns zero (carry on  matching)  by
+       default,  but  you can use a callout_fail modifier in a subject line to
       change this and other parameters of the callout (see below).

       If the callout_capture modifier is set, the current captured groups are
       output when a callout occurs. This is useful only for non-DFA matching,
-       as  pcre2_dfa_match()  does  not  support capturing, so no captures are
+       as pcre2_dfa_match() does not support capturing,  so  no  captures  are
       ever shown.

       The normal callout output, showing the callout number or pattern offset
-       (as  described above) is suppressed if the callout_no_where modifier is
+       (as described above) is suppressed if the callout_no_where modifier  is
       set.

-       When using the interpretive  matching  function  pcre2_match()  without
-       JIT,  setting  the callout_extra modifier causes additional output from
-       pcre2test's callout function to be generated. For the first callout  in
-       a  match  attempt at a new starting position in the subject, "New match
-       attempt" is output. If there has been a backtrack since the last  call-
+       When  using  the  interpretive  matching function pcre2_match() without
+       JIT, setting the callout_extra modifier causes additional  output  from
+       pcre2test's  callout function to be generated. For the first callout in
+       a match attempt at a new starting position in the subject,  "New  match
+       attempt"  is output. If there has been a backtrack since the last call-
       out (or start of matching if this is the first callout), "Backtrack" is
-       output, followed by "No other matching paths" if  the  backtrack  ended
+       output,  followed  by  "No other matching paths" if the backtrack ended
       the previous match attempt. For example:

          re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
@ -1796,86 +1814,86 @@ CALLOUTS
          +1    ^    a+
         No match

-       Notice  that  various  optimizations must be turned off if you want all
-       possible matching paths to be  scanned.  If  no_start_optimize  is  not
-       used,  there  is an immediate "no match", without any callouts, because
-       the starting optimization fails to find "b" in the  subject,  which  it
-       knows  must  be  present for any match. If no_auto_possess is not used,
-       the "a+" item is turned into "a++", which reduces the number  of  back-
+       Notice that various optimizations must be turned off if  you  want  all
+       possible  matching  paths  to  be  scanned. If no_start_optimize is not
+       used, there is an immediate "no match", without any  callouts,  because
+       the  starting  optimization  fails to find "b" in the subject, which it
+       knows must be present for any match. If no_auto_possess  is  not  used,
+       the  "a+"  item is turned into "a++", which reduces the number of back-
       tracks.

-       The  callout_extra modifier has no effect if used with the DFA matching
+       The callout_extra modifier has no effect if used with the DFA  matching
       function, or with JIT.

   Return values from callouts

-       The default return from the callout  function  is  zero,  which  allows
+       The  default  return  from  the  callout function is zero, which allows
       matching to continue. The callout_fail modifier can be given one or two
       numbers. If there is only one number, 1 is returned instead of 0 (caus-
       ing matching to backtrack) when a callout of that number is reached. If
-       two numbers (<n>:<m>) are given, 1 is  returned  when  callout  <n>  is
-       reached  and  there  have been at least <m> callouts. The callout_error
+       two  numbers  (<n>:<m>)  are  given,  1 is returned when callout <n> is
+       reached and there have been at least <m>  callouts.  The  callout_error
       modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
-       ing  the entire matching process to be aborted. If both these modifiers
-       are set for the same callout number,  callout_error  takes  precedence.
-       Note  that  callouts  with string arguments are always given the number
+       ing the entire matching process to be aborted. If both these  modifiers
+       are  set  for  the same callout number, callout_error takes precedence.
+       Note that callouts with string arguments are always  given  the  number
       zero.

-       The callout_data modifier can be given an unsigned or a  negative  num-
-       ber.   This  is  set  as the "user data" that is passed to the matching
-       function, and passed back when the callout  function  is  invoked.  Any
-       value  other  than  zero  is  used as a return from pcre2test's callout
+       The  callout_data  modifier can be given an unsigned or a negative num-
+       ber.  This is set as the "user data" that is  passed  to  the  matching
+       function,  and  passed  back  when the callout function is invoked. Any
+       value other than zero is used as  a  return  from  pcre2test's  callout
       function.

       Inserting callouts can be helpful when using pcre2test to check compli-
-       cated  regular expressions. For further information about callouts, see
+       cated regular expressions. For further information about callouts,  see
       the pcre2callout documentation.


 NON-PRINTING CHARACTERS

       When pcre2test is outputting text in the compiled version of a pattern,
-       bytes  other  than 32-126 are always treated as non-printing characters
+       bytes other than 32-126 are always treated as  non-printing  characters
       and are therefore shown as hex escapes.

-       When pcre2test is outputting text that is a matched part of  a  subject
-       string,  it behaves in the same way, unless a different locale has been
-       set for the pattern (using the locale modifier). In this case, the  is-
+       When  pcre2test  is outputting text that is a matched part of a subject
+       string, it behaves in the same way, unless a different locale has  been
+       set  for the pattern (using the locale modifier). In this case, the is-
       print() function is used to distinguish printing and non-printing char-
       acters.


 SAVING AND RESTORING COMPILED PATTERNS

-       It is possible to save compiled patterns  on  disc  or  elsewhere,  and
+       It  is  possible  to  save  compiled patterns on disc or elsewhere, and
       reload them later, subject to a number of restrictions. JIT data cannot
-       be saved. The host on which the patterns are reloaded must  be  running
+       be  saved.  The host on which the patterns are reloaded must be running
       the same version of PCRE2, with the same code unit width, and must also
-       have the same endianness, pointer width  and  PCRE2_SIZE  type.  Before
-       compiled  patterns  can be saved they must be serialized, that is, con-
-       verted to a stream of bytes. A single byte stream may contain any  num-
-       ber  of compiled patterns, but they must all use the same character ta-
-       bles. A single copy of the tables is included in the byte  stream  (its
+       have  the  same  endianness,  pointer width and PCRE2_SIZE type. Before
+       compiled patterns can be saved they must be serialized, that  is,  con-
+       verted  to a stream of bytes. A single byte stream may contain any num-
+       ber of compiled patterns, but they must all use the same character  ta-
+       bles.  A  single copy of the tables is included in the byte stream (its
       size is 1088 bytes).

-       The  functions whose names begin with pcre2_serialize_ are used for se-
-       rializing and de-serializing. They are described in the  pcre2serialize
-       documentation.  In  this  section we describe the features of pcre2test
+       The functions whose names begin with pcre2_serialize_ are used for  se-
+       rializing  and de-serializing. They are described in the pcre2serialize
+       documentation. In this section we describe the  features  of  pcre2test
       that can be used to test these functions.

-       Note that "serialization" in PCRE2 does not convert  compiled  patterns
-       to  an  abstract  format  like Java or .NET. It just makes a reloadable
+       Note  that  "serialization" in PCRE2 does not convert compiled patterns
+       to an abstract format like Java or .NET. It  just  makes  a  reloadable
       byte code stream.  Hence the restrictions on reloading mentioned above.

-       In pcre2test, when a pattern with push modifier  is  successfully  com-
-       piled,  it  is  pushed onto a stack of compiled patterns, and pcre2test
-       expects the next line to contain a new pattern (or command) instead  of
+       In  pcre2test,  when  a pattern with push modifier is successfully com-
+       piled, it is pushed onto a stack of compiled  patterns,  and  pcre2test
+       expects  the next line to contain a new pattern (or command) instead of
       a subject line. By contrast, the pushcopy modifier causes a copy of the
-       compiled pattern to be stacked, leaving the original available for  im-
-       mediate  matching.  By using push and/or pushcopy, a number of patterns
-       can be compiled and retained. These  modifiers  are  incompatible  with
+       compiled  pattern to be stacked, leaving the original available for im-
+       mediate matching. By using push and/or pushcopy, a number  of  patterns
+       can  be  compiled  and  retained. These modifiers are incompatible with
       posix, and control modifiers that act at match time are ignored (with a
-       message) for the stacked patterns. The jitverify modifier applies  only
+       message)  for the stacked patterns. The jitverify modifier applies only
       at compile time.

       The command
@ -1883,21 +1901,21 @@ SAVING AND RESTORING COMPILED PATTERNS
         #save <filename>

       causes all the stacked patterns to be serialized and the result written
-       to the named file. Afterwards, all the stacked patterns are freed.  The
+       to  the named file. Afterwards, all the stacked patterns are freed. The
       command

         #load <filename>

-       reads  the  data in the file, and then arranges for it to be de-serial-
-       ized, with the resulting compiled patterns added to the pattern  stack.
-       The  pattern  on the top of the stack can be retrieved by the #pop com-
-       mand, which must be followed by  lines  of  subjects  that  are  to  be
-       matched  with  the pattern, terminated as usual by an empty line or end
-       of file. This command may be followed by  a  modifier  list  containing
-       only  control  modifiers that act after a pattern has been compiled. In
-       particular, hex, posix, posix_nosub, push, and  pushcopy  are  not  al-
-       lowed,  nor  are  any option-setting modifiers.  The JIT modifiers are,
-       however permitted. Here is an example that saves and reloads  two  pat-
+       reads the data in the file, and then arranges for it to  be  de-serial-
+       ized,  with the resulting compiled patterns added to the pattern stack.
+       The pattern on the top of the stack can be retrieved by the  #pop  com-
+       mand,  which  must  be  followed  by  lines  of subjects that are to be
+       matched with the pattern, terminated as usual by an empty line  or  end
+       of  file.  This  command  may be followed by a modifier list containing
+       only control modifiers that act after a pattern has been  compiled.  In
+       particular,  hex,  posix,  posix_nosub,  push, and pushcopy are not al-
+       lowed, nor are any option-setting modifiers.  The  JIT  modifiers  are,
+       however, permitted.  Here is an example that saves and reloads two pat-
       terns.

         /abc/push
@ -1910,10 +1928,10 @@ SAVING AND RESTORING COMPILED PATTERNS
         #pop jit,bincode
         abc

-       If  jitverify  is  used with #pop, it does not automatically imply jit,
+       If jitverify is used with #pop, it does not  automatically  imply  jit,
       which is different behaviour from when it is used on a pattern.

-       The #popcopy command is analagous to the pushcopy modifier in  that  it
+       The  #popcopy  command is analogous to the pushcopy modifier in that it
       makes current a copy of the topmost stack pattern, leaving the original
       still on the stack.

@ -1933,5 +1951,5 @@ AUTHOR

 REVISION

-       Last updated: 30 August 2021
-       Copyright (c) 1997-2021 University of Cambridge.
+       Last updated: 27 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
--- a/doc/pcre2unicode.3
+++ b/doc/pcre2unicode.3
@ -1,4 +1,4 @@
-.TH PCRE2UNICODE 3 "23 February 2020" "PCRE2 10.35"
+.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40"
 .SH NAME
 PCRE - Perl-compatible regular expressions (revised API)
 .SH "UNICODE AND UTF SUPPORT"
@ -40,10 +40,11 @@ handled, as documented below.
 .sp
 When PCRE2 is built with Unicode support, the escape sequences \ep{..},
 \eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting.
-The Unicode properties that can be tested are limited to the general category
-properties such as Lu for an upper case letter or Nd for a decimal number, the
-Unicode script names such as Arabic or Han, and the derived properties Any and
-L&. Full lists are given in the
+The Unicode properties that can be tested are a subset of those that Perl
+supports. Currently they are limited to the general category properties such as
+Lu for an upper case letter or Nd for a decimal number, the Unicode script
+names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
+properties Any and LC (synonym L&). Full lists are given in the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -51,10 +52,10 @@ and
 .\" HREF
 \fBpcre2syntax\fP
 .\"
-documentation. Only the short names for properties are supported. For example,
-\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE2 does not support this.
+documentation. In general, only the short names for properties are supported.
+For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not
+supported. Furthermore, in Perl, many properties may optionally be prefixed by
+"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
 .
 .
 .SH "WIDE CHARACTERS AND UTF MODES"
@ -448,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -457,6 +458,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 23 February 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 22 December 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/index.md
+++ b/index.md
@ -14,14 +14,14 @@ flexible API, the code of PCRE2 has been much improved since the fork.
 ## Download

 As well as downloading from the 
-[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
 or the older, unmaintained PCRE1 library from an 
 [*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.

 You can check out the PCRE2 source code via Git or Subversion:

-    git clone https://github.com/PhilipHazel/pcre2.git
-    svn co    https://github.com/PhilipHazel/pcre2.git
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git

 ## Contributed Ports

@ -36,7 +36,7 @@ default character encoding, can be found at
 ## Documentation

 You can read the PCRE2 documentation 
-[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).

 Comparisons to Perl's regular expression semantics can be found in the
 community authored Wikipedia entry for PCRE.
--- a/maint/GenerateCommon.py
+++ b/maint/GenerateCommon.py
@ -0,0 +1,355 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This file is a Python module containing common lists and functions for the
+# GenerateXXX scripts that create various .c and .h files from Unicode data
+# files. It was created as part of a re-organization of these scripts in
+# December 2021.
+
+
+import re
+
+
+# ---------------------------------------------------------------------------
+#                             DATA LISTS
+# ---------------------------------------------------------------------------
+
+# BIDI classes in the DerivedBidiClass.txt file, with comments. The list is
+# flat: each class name is immediately followed by its descriptive comment,
+# so the names are the entries at even indexes.
+
+bidi_classes = [
+  'AL',  'Arabic letter',
+  'AN',  'Arabic number',
+  'B',   'Paragraph separator',
+  'BN',  'Boundary neutral',
+  'CS',  'Common separator',
+  'EN',  'European number',
+  'ES',  'European separator',
+  'ET',  'European terminator',
+  'FSI', 'First strong isolate',
+  'L',   'Left to right',
+  'LRE', 'Left to right embedding',
+  'LRI', 'Left to right isolate',
+  'LRO', 'Left to right override',
+  'NSM', 'Non-spacing mark',
+  'ON',  'Other neutral',
+  'PDF', 'Pop directional format',
+  'PDI', 'Pop directional isolate',
+  'R',   'Right to left',
+  'RLE', 'Right to left embedding',
+  'RLI', 'Right to left isolate',
+  'RLO', 'Right to left override',
+  'S',   'Segment separator',
+  'WS',  'White space'
+  ]
+
+# Particular category property names, with comments. The list is flat: each
+# two-letter name is immediately followed by its descriptive comment, so the
+# names are the entries at even indexes. NOTE: If ever this list is changed,
+# the table called "catposstab" in the pcre2_auto_possess.c file must be
+# edited to keep in step.
+
+category_names = [
+  'Cc', 'Control',
+  'Cf', 'Format',
+  'Cn', 'Unassigned',
+  'Co', 'Private use',
+  'Cs', 'Surrogate',
+  'Ll', 'Lower case letter',
+  'Lm', 'Modifier letter',
+  'Lo', 'Other letter',
+  'Lt', 'Title case letter',
+  'Lu', 'Upper case letter',
+  'Mc', 'Spacing mark',
+  'Me', 'Enclosing mark',
+  'Mn', 'Non-spacing mark',
+  'Nd', 'Decimal number',
+  'Nl', 'Letter number',
+  'No', 'Other number',
+  'Pc', 'Connector punctuation',
+  'Pd', 'Dash punctuation',
+  'Pe', 'Close punctuation',
+  'Pf', 'Final punctuation',
+  'Pi', 'Initial punctuation',
+  'Po', 'Other punctuation',
+  'Ps', 'Open punctuation',
+  'Sc', 'Currency symbol',
+  'Sk', 'Modifier symbol',
+  'Sm', 'Mathematical symbol',
+  'So', 'Other symbol',
+  'Zl', 'Line separator',
+  'Zp', 'Paragraph separator',
+  'Zs', 'Space separator'
+  ]
+
+# Grapheme break property names. The list is flat: each name is immediately
+# followed by a string holding its index value and, for some entries, a
+# short description.
+#
+# The Extended_Pictographic property is not found in the file where all the
+# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
+# file, but we list it here so that the name has the correct index value.
+
+break_properties = [
+  'CR',                    ' 0',
+  'LF',                    ' 1',
+  'Control',               ' 2',
+  'Extend',                ' 3',
+  'Prepend',               ' 4',
+  'SpacingMark',           ' 5',
+  'L',                     ' 6 Hangul syllable type L',
+  'V',                     ' 7 Hangul syllable type V',
+  'T',                     ' 8 Hangul syllable type T',
+  'LV',                    ' 9 Hangul syllable type LV',
+  'LVT',                   '10 Hangul syllable type LVT',
+  'Regional_Indicator',    '11',
+  'Other',                 '12',
+  'ZWJ',                   '13',
+  'Extended_Pictographic', '14'
+  ]
+
+# List of files from which the names of Boolean properties are obtained, along
+# with a list of regex patterns for property names to be ignored (applied
+# with re.match, so they are anchored at the start of the name), and a list
+# of extra property names that are added to those found in the files.
+
+bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
+bool_propsignore = [r'^Other_', r'^Hyphen$']
+bool_propsextras = ['ASCII', 'Bidi_Mirrored']
+
+
+# ---------------------------------------------------------------------------
+#                   GET BOOLEAN PROPERTY NAMES
+# ---------------------------------------------------------------------------
+
+# Get a list of Boolean property names from a number of files.
+
+def getbpropslist():
+  bplist = []
+  bplast = ""
+
+  for filename in bool_propsfiles:
+    try:
+      file = open('Unicode.tables/' + filename, 'r')
+    except IOError:
+      print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
+      sys.exit(1)
+
+    for line in file:
+      line = re.sub(r'#.*', '', line)
+      data = list(map(str.strip, line.split(';')))
+      if len(data) <= 1 or data[1] == bplast:
+        continue
+      bplast = data[1]
+      for pat in bool_propsignore:
+        if re.match(pat, bplast) != None:
+          break
+      else:
+        bplist.append(bplast)
+
+    file.close()
+
+  bplist.extend(bool_propsextras)
+  bplist.sort()
+  return bplist
+
+bool_properties = getbpropslist()
+bool_props_list_item_size = (len(bool_properties) + 31) // 32
+
+
+
+# ---------------------------------------------------------------------------
+#                  COLLECTING PROPERTY NAMES AND ALIASES
+# ---------------------------------------------------------------------------
+
+# Results of collect_property_names(): the list of script names, which always
+# starts with 'Unknown', and a dictionary that maps a script name or a Boolean
+# property name to a tuple of its alternative names.
+
+script_names = ['Unknown']
+abbreviations = {}
+
+# Append the script names found in Scripts.txt to script_names (a name is added
+# whenever it differs from the name on the previous matching line), then record
+# the script aliases from PropertyValueAliases.txt and the Boolean property
+# aliases from PropertyAliases.txt in the abbreviations dictionary.
+
+def collect_property_names():
+  global script_names
+  global abbreviations
+
+  script_line_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
+
+  previous_name = ""
+  with open("Unicode.tables/Scripts.txt") as scripts_file:
+    for text in scripts_file:
+      found = script_line_re.match(text)
+      if found is not None and found.group(1) != previous_name:
+        previous_name = found.group(1)
+        script_names.append(previous_name)
+
+  # A line may carry a trailing comment, so simply splitting it around the
+  # semicolons is not enough; a regular expression picks out the fields.
+  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
+
+  with open("Unicode.tables/PropertyValueAliases.txt") as aliases_file:
+    for text in aliases_file:
+      found = value_alias_re.match(text)
+      if found is None:
+        continue
+
+      prop, short_name, long_name, extra_name = found.groups()
+      if prop != "sc":
+        continue
+
+      # Only the "sc" (script) lines are of interest. A script whose short and
+      # long names are the same gets an empty tuple.
+      if short_name == long_name:
+        abbreviations[long_name] = ()
+      elif extra_name is None:
+        abbreviations[long_name] = (short_name,)
+      else:
+        abbreviations[long_name] = (short_name, extra_name)
+
+  # The Boolean property abbreviations go into the same dictionary.
+
+  bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
+  with open("Unicode.tables/PropertyAliases.txt") as props_file:
+    for text in props_file:
+      found = bin_alias_re.match(text)
+      if found is None:
+        continue
+
+      short_name, long_name, extra_name = found.groups()
+      if long_name not in bool_properties:
+        continue
+
+      if extra_name is None:
+        abbreviations[long_name] = (short_name,)
+      else:
+        abbreviations[long_name] = (short_name, extra_name)
+
+collect_property_names()
+
+
+
+# ---------------------------------------------------------------------------
+#                      REORDERING SCRIPT NAMES
+# ---------------------------------------------------------------------------
+
+# The script abbreviations, kept parallel to script_names.
+
+script_abbrevs = []
+
+# Fill in script_abbrevs, then rearrange both script_names and script_abbrevs
+# so that every script whose abbreviation is mentioned in ScriptExtensions.txt
+# comes before all the others. The relative order inside each of the two
+# groups is left as it was.
+
+def reorder_scripts():
+  global script_names
+  global script_abbrevs
+  global abbreviations
+
+  # A script with no recorded alias is abbreviated by its own name.
+  for full_name in script_names:
+    aliases = abbreviations[full_name]
+    script_abbrevs.append(aliases[0] if aliases else full_name)
+
+  # Collect every abbreviation that occurs in a script extension list.
+  in_extensions = set()
+  with open("Unicode.tables/ScriptExtensions.txt") as f:
+    scx_line_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
+
+    for text in f:
+      found = scx_line_re.match(text)
+      if found is not None:
+        in_extensions.update(found.group(1).split(" "))
+
+  # Split the (name, abbreviation) pairs into the two groups in one pass.
+  leading = []
+  trailing = []
+  for position, abbrev in enumerate(script_abbrevs):
+    pair = (script_names[position], abbrev)
+    if abbrev in in_extensions:
+      leading.append(pair)
+    else:
+      trailing.append(pair)
+
+  ordered = leading + trailing
+  script_names = [pair[0] for pair in ordered]
+  script_abbrevs = [pair[1] for pair in ordered]
+
+reorder_scripts()
+
+# Number of 32-bit words needed to give one bit to each script that is listed
+# before 'Unknown'.
+script_list_item_size = (script_names.index('Unknown') + 31) // 32
+
+
+# ---------------------------------------------------------------------------
+#                         DERIVED LISTS
+# ---------------------------------------------------------------------------
+
+# The general category names are the distinct first letters of the entries at
+# the even positions of category_names, held in sorted order.
+
+gcn_set = {category_names[pos][0] for pos in range(0, len(category_names), 2)}
+general_category_names = sorted(gcn_set)
+
+
+# ---------------------------------------------------------------------------
+#                           FUNCTIONS
+# ---------------------------------------------------------------------------
+
+import sys
+
+# Open the output file for writing and return it. The file name is the single
+# command line argument when there is one, otherwise the given default. The
+# common header (title, authorship, generated-file warning naming this script,
+# and the licence text) is written before the file is returned. Giving more
+# than one argument, or failing to open the file, prints a message and exits
+# with status 1.
+
+def open_output(default):
+  argc = len(sys.argv)
+  if argc > 2:
+    print('** Too many arguments: just give a file name')
+    sys.exit(1)
+
+  output_name = sys.argv[1] if argc == 2 else default
+  try:
+    file = open(output_name, "w")
+  except IOError:
+    print ("** Couldn't open %s" % output_name)
+    sys.exit(1)
+
+  # Only the last component of the script's path is quoted in the header.
+  script_name = sys.argv[0].rsplit('/', 1)[-1]
+
+  banner = """\
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
+
+This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
+"""
+
+  how_to_regenerate = ("Instead, modify the maint/%s script and run it to generate\n"
+  "a new version of this code.\n\n" % script_name)
+
+  licence = """\
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+\n"""
+
+  for chunk in (banner, how_to_regenerate, licence):
+    file.write(chunk)
+  return file
+
+# End of UcpCommon.py
--- a/maint/GenerateTest26.py
+++ b/maint/GenerateTest26.py
@ -0,0 +1,188 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+#
+# This file auto-generates unicode property tests and their expected output.
+# It is recommended to re-run this generator after the unicode files are
+# updated. The names of the generated files are `testinput26` and `testoutput26`
+
+import re
+import sys
+
+from GenerateCommon import \
+  script_names, \
+  script_abbrevs
+
+# Write the same text to the test input file and then to the expected output
+# file.
+
+def write_both(text):
+  for target in (input_file, output_file):
+    target.write(text)
+
+# Return the text used for a code point in a test subject line. Code points
+# from 32 to 127 are returned as the character itself; everything else becomes
+# a \x{...} hexadecimal escape, with a leading zero for values below 16.
+
+def to_string_char(ch_idx):
+  if ch_idx < 16:
+    return "\\x{0%x}" % ch_idx
+  if 32 <= ch_idx < 128:
+    return chr(ch_idx)
+  return "\\x{%x}" % ch_idx
+
+# The optional command line argument names the directory for the generated
+# files. It is used as a prefix of the file names, so a slash is appended
+# unless the argument already ends with one.
+
+if len(sys.argv) > 2:
+  print('** Too many arguments: just give a directory name')
+  sys.exit(1)
+
+output_directory = ""
+if len(sys.argv) == 2:
+  output_directory = sys.argv[1] if sys.argv[1].endswith("/") else sys.argv[1] + "/"
+
+input_name = output_directory + "testinput26"
+output_name = output_directory + "testoutput26"
+
+try:
+  input_file = open(input_name, "w")
+  output_file = open(output_name, "w")
+except IOError:
+  print ("** Couldn't open output files")
+  sys.exit(1)
+
+write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
+
+# ---------------------------------------------------------------------------
+#                      UNICODE SCRIPT EXTENSION TESTS
+# ---------------------------------------------------------------------------
+
+write_both("# Unicode Script Extension tests.\n\n")
+
+# Generate the script and script extension tests. Scripts.txt and
+# ScriptExtensions.txt are scanned to pick sample code points for each script,
+# and then a group of patterns with subject lines is written for every script
+# except "Unknown". Patterns and subjects go to both files; the expected
+# results (" 0: ..." or "No match") go to the output file only.
+
+def gen_script_tests():
+  # One record per script, created on first use as a five-element list:
+  #   [0] low end of the first Scripts.txt range seen for the script
+  #   [1] high end of the most recent Scripts.txt range seen for the script
+  #   [2] lowest code point of any ScriptExtensions.txt range for the script
+  #   [3] highest code point of any ScriptExtensions.txt range for the script
+  #   [4] first code point found in an extension range whose Scripts.txt
+  #       script is a different one (None if no such code point was found)
+  script_data = [None] * len(script_names)
+
+  # The Scripts.txt script name of every code point (None when not listed)
+  char_data = [None] * 0x110000
+
+  # Groups: 1 = first code point, 2 = optional end of range, 3 = property value
+  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
+  prev_name = ""
+  script_idx = -1
+
+  with open("Unicode.tables/Scripts.txt") as f:
+    for line in f:
+      match_obj = property_re.match(line)
+
+      if match_obj == None:
+        continue
+
+      # Look the script up again only when the name changes between lines
+      name = match_obj.group(3)
+      if name != prev_name:
+        script_idx = script_names.index(name)
+        prev_name = name
+
+      low = int(match_obj.group(1), 16)
+      high = low
+      char_data[low] = name
+
+      if match_obj.group(2) != None:
+        high = int(match_obj.group(2), 16)
+        for idx in range(low + 1, high + 1):
+           char_data[idx] = name
+
+      if script_data[script_idx] == None:
+        script_data[script_idx] = [low, None, None, None, None]
+      script_data[script_idx][1] = high
+
+  # Maps each abbreviation already met in ScriptExtensions.txt to its index
+  # in script_abbrevs
+  extended_script_indicies = {}
+
+  with open("Unicode.tables/ScriptExtensions.txt") as f:
+    for line in f:
+      match_obj = property_re.match(line)
+
+      if match_obj == None:
+        continue
+
+      low = int(match_obj.group(1), 16)
+      high = low
+      if match_obj.group(2) != None:
+        high = int(match_obj.group(2), 16)
+
+      # The property value is a space-separated list of script abbreviations
+      for abbrev in match_obj.group(3).split(" "):
+        if abbrev not in extended_script_indicies:
+          idx = script_abbrevs.index(abbrev)
+          extended_script_indicies[abbrev] = idx
+          rec = script_data[idx]
+          rec[2] = low
+          rec[3] = high
+        else:
+          # Widen the recorded extension span to include this range
+          idx = extended_script_indicies[abbrev]
+          rec = script_data[idx]
+          if rec[2] > low:
+            rec[2] = low
+          if rec[3] < high:
+            rec[3] = high
+
+        # Until one is found, look in this range for a code point whose
+        # Scripts.txt script is not this script. Note that the inner loop
+        # reuses the name idx; it is assigned afresh for the next abbreviation.
+        if rec[4] == None:
+          name = script_names[idx]
+          for idx in range(low, high + 1):
+            if char_data[idx] != name:
+              rec[4] = idx
+              break
+
+  # Alternates between the "scx" and "Script_Extensions" spellings from one
+  # script with extension data to the next
+  long_property_name = False
+
+  for idx, rec in enumerate(script_data):
+    script_name = script_names[idx]
+
+    if script_name == "Unknown":
+      continue
+
+    script_abbrev = script_abbrevs[idx]
+
+    # \p{sc=<name>} must match the first sample code point, and
+    # \p{Script=<abbreviation>} must match the second
+    write_both("# Base script check\n")
+    write_both("/^\\p{sc=%s}/utf\n" % script_name)
+    write_both("  %s\n" % to_string_char(rec[0]))
+    output_file.write(" 0: %s\n" % to_string_char(rec[0]))
+    write_both("\n")
+
+    write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
+    write_both("  %s\n" % to_string_char(rec[1]))
+    output_file.write(" 0: %s\n" % to_string_char(rec[1]))
+    write_both("\n")
+
+    # These tests are written only for scripts that occur in
+    # ScriptExtensions.txt
+    if rec[2] != None:
+      property_name = "scx"
+      if long_property_name:
+        property_name = "Script_Extensions"
+
+      write_both("# Script extension check\n")
+      write_both("/^\\p{%s}/utf\n" % script_name)
+      write_both("  %s\n" % to_string_char(rec[2]))
+      output_file.write(" 0: %s\n" % to_string_char(rec[2]))
+      write_both("\n")
+
+      write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
+      write_both("  %s\n" % to_string_char(rec[3]))
+      output_file.write(" 0: %s\n" % to_string_char(rec[3]))
+      write_both("\n")
+
+      long_property_name = not long_property_name
+
+      if rec[4] != None:
+        # The extension-only character is expected to match \p{<name>} but
+        # not \p{sc=<name>}
+        write_both("# Script extension only character\n")
+        write_both("/^\\p{%s}/utf\n" % script_name)
+        write_both("  %s\n" % to_string_char(rec[4]))
+        output_file.write(" 0: %s\n" % to_string_char(rec[4]))
+        write_both("\n")
+
+        write_both("/^\\p{sc=%s}/utf\n" % script_name)
+        write_both("  %s\n" % to_string_char(rec[4]))
+        output_file.write("No match\n")
+        write_both("\n")
+      else:
+        print("External character has not found for %s" % script_name)
+
+    # The negative test uses the code point just after the larger of the two
+    # recorded upper limits
+    high = rec[1]
+    if rec[3] != None and rec[3] > rec[1]:
+      high = rec[3]
+    write_both("# Character not in script\n")
+    write_both("/^\\p{%s}/utf\n" % script_name)
+    write_both("  %s\n" % to_string_char(high + 1))
+    output_file.write("No match\n")
+    write_both("\n")
+
+
+gen_script_tests()
+
+write_both("# End of testinput26\n")
--- a/maint/GenerateUcd.py
+++ b/maint/GenerateUcd.py
@ -0,0 +1,923 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+#
+# This script generates the pcre2_ucd.c file from Unicode data files. This is
+# the compressed Unicode property data used by PCRE2. The script was created in
+# December 2021 as part of the Unicode data generation refactoring. It is
+# basically a re-working of the MultiStage2.py script that was submitted to the
+# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
+# Unicode property support. A number of extensions have since been added. The
+# main difference in the 2021 upgrade (apart from comments and layout) is that
+# the data tables (e.g. list of script names) are now listed in or generated by
+# a separate Python module that is shared with the other Generate scripts.
+#
+# This script must be run in the "maint" directory. It requires the following
+# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
+# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
+# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
+# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
+# emoji-data.txt. These must be in the Unicode.tables subdirectory.
+#
+# The emoji-data.txt file is found in the "emoji" subdirectory even though it
+# is technically part of a different (but coordinated) standard as shown
+# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
+# for example:
+#
+# http://unicode.org/Public/emoji/13.0/ReadMe.txt
+#
+# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
+# subdirectory of the Unicode database (UCD) on the Unicode web site;
+# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
+# are in the top-level UCD directory.
+#
+# -----------------------------------------------------------------------------
+# Minor modifications made to the original script:
+#  Added #! line at start
+#  Removed tabs
+#  Made it work with Python 2.4 by rewriting two statements that needed 2.5
+#  Consequent code tidy
+#  Adjusted data file names to take from the Unicode.tables directory
+#  Adjusted global table names by prefixing _pcre_.
+#  Commented out stuff relating to the casefolding table, which isn't used;
+#    removed completely in 2012.
+#  Corrected size calculation
+#  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
+#  Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
+#
+# Major modifications made to the original script:
+#  Added code to add a grapheme break property field to records.
+#
+#  Added code to search for sets of more than two characters that must match
+#  each other caselessly. A new table is output containing these sets, and
+#  offsets into the table are added to the main output records. This new
+#  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
+#  used.
+#
+#  Update for Python3:
+#    . Processed with 2to3, but that didn't fix everything
+#    . Changed string.strip to str.strip
+#    . Added encoding='utf-8' to the open() call
+#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
+#        required and the result of the division is a float
+#
+#  Added code to scan the emoji-data.txt file to find the Extended Pictographic
+#  property, which is used by PCRE2 as a grapheme breaking property. This was
+#  done when updating to Unicode 11.0.0 (July 2018).
+#
+#  Added code to add a Script Extensions field to records. This has increased
+#  their size from 8 to 12 bytes, only 10 of which are currently used.
+#
+#  Added code to add a bidi class field to records by scanning the
+#  DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
+#  bytes, so now 11 out of 12 are in use.
+#
+# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
+# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
+#     July-2012:     Updated list of scripts for Unicode 6.1.0
+# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
+#                      field in the record to hold the value. Luckily, the
+#                      structure had a hole in it, so the resulting table is
+#                      not much bigger than before.
+# 18-September-2012: Added code for multiple caseless sets. This uses the
+#                      final hole in the structure.
+# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
+# 13-May-2014:       Updated for PCRE2
+# 03-June-2014:      Updated for Python 3
+# 20-June-2014:      Updated for Unicode 7.0.0
+# 12-August-2014:    Updated to put Unicode version into the file
+# 19-June-2015:      Updated for Unicode 8.0.0
+# 02-July-2017:      Updated for Unicode 10.0.0
+# 03-July-2018:      Updated for Unicode 11.0.0
+# 07-July-2018:      Added code to scan emoji-data.txt for the Extended
+#                      Pictographic property.
+# 01-October-2018:   Added the 'Unknown' script name
+# 03-October-2018:   Added new field for Script Extensions
+# 27-July-2019:      Updated for Unicode 12.1.0
+# 10-March-2020:     Updated for Unicode 13.0.0
+# PCRE2-10.39:       Updated for Unicode 14.0.0
+# 05-December-2021:  Added code to scan DerivedBidiClass.txt for bidi class,
+#                      and also PropList.txt for the Bidi_Control property
+# 19-December-2021:  Reworked script extensions lists to be bit maps instead
+#                      of zero-terminated lists of script numbers.
+# ----------------------------------------------------------------------------
+#
+# Changes to the refactored script:
+#
+# 26-December-2021:  Refactoring completed
+# 10-January-2022:   Addition of general Boolean property support
+# 12-January-2022:   Merge scriptx and bidiclass fields
+# 14-January-2022:   Enlarge Boolean property offset to 12 bits
+#
+# ----------------------------------------------------------------------------
+#
+#
+# The main tables generated by this script are used by macros defined in
+# pcre2_internal.h. They look up Unicode character properties using short
+# sequences of code that contains no branches, which makes for greater speed.
+#
+# Conceptually, there is a table of records (of type ucd_record), one for each
+# Unicode character. Each record contains the script number, script extension
+# value, character type, grapheme break type, offset to caseless matching set,
+# offset to the character's other case, the bidi class, and offset to bitmap of
+# Boolean properties.
+#
+# A real table covering all Unicode characters would be far too big. It can be
+# efficiently compressed by observing that many characters have the same
+# record, and many blocks of characters (taking 128 characters in a block) have
+# the same set of records as other blocks. This leads to a 2-stage lookup
+# process.
+#
+# This script constructs seven tables. The ucd_caseless_sets table contains
+# lists of characters that all match each other caselessly. Each list is
+# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
+# any valid character. The first list is empty; this is used for characters
+# that are not part of any list.
+#
+# The ucd_digit_sets table contains the code points of the '9' characters in
+# each set of 10 decimal digits in Unicode. This is used to ensure that digits
+# in script runs all come from the same set. The first element in the vector
+# contains the number of subsequent elements, which are in ascending order.
+#
+# Scripts are partitioned into two groups. Scripts that appear in at least one
+# character's script extension list come first, followed by "Unknown" and then
+# all the rest. This sorting is done automatically in the GenerateCommon.py
+# script. A script's number is its index in the script_names list.
+#
+# The ucd_script_sets table contains bitmaps that represent lists of scripts
+# for Script Extensions properties. Each bitmap consists of a fixed number of
+# unsigned 32-bit numbers, enough to allocate a bit for every script that is
+# used in any character's extension list, that is, enough for every script
+# whose number is less than ucp_Unknown. A character's script extension value
+# in its ucd record is an offset into the ucd_script_sets vector. The first
+# bitmap has no bits set; characters that have no script extensions have zero
+# as their script extensions value so that they use this map.
+#
+# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
+# properties. Each bitmap consists of a fixed number of unsigned 32-bit
+# numbers, enough to allocate a bit for each supported Boolean property.
+#
+# The ucd_records table contains one instance of every unique character record
+# that is required. The ucd_stage1 table is indexed by a character's block
+# number, which is the character's code point divided by 128, since 128 is the
+# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
+# number.
+#
+# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
+# the offset of a character within its own block, and the result is the index
+# number of the required record in the ucd_records vector.
+#
+# The following examples are correct for the Unicode 14.0.0 database. Future
+# updates may change the actual lookup values.
+#
+# Example: lowercase "a" (U+0061) is in block 0
+#          lookup 0 in stage1 table yields 0
+#          lookup 97 (0x61) in the first table in stage2 yields 35
+#          record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
+#             0 = ucp_Latin   => Latin script
+#             5 = ucp_Ll      => Lower case letter
+#            12 = ucp_gbOther => Grapheme break property "Other"
+#             0               => Not part of a caseless set
+#           -32 (-0x20)       => Other case is U+0041
+#         18432 = 0x4800      => Combined Bidi class + script extension values
+#            44               => Offset to Boolean properties
+#
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#             9 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special script extension property
+#
+# Almost all lowercase latin characters resolve to the same record. One or two
+# are different because they are part of a multi-character caseless set (for
+# example, k, K and the Kelvin symbol are such a set).
+#
+# Example: hiragana letter A (U+3042) is in block 96 (0x60)
+#          lookup 96 in stage1 table yields 93
+#          lookup 66 (0x42) in table 93 in stage2 yields 819
+#          record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
+#            20 = ucp_Hiragana => Hiragana script
+#             7 = ucp_Lo       => Other letter
+#            12 = ucp_gbOther  => Grapheme break property "Other"
+#             0                => Not part of a caseless set
+#             0                => No other case
+#         18432 = 0x4800       => Combined Bidi class + script extension values
+#            82                => Offset to Boolean properties
+#
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#             9 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special script extension property
+#
+# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
+#          lookup 57 in stage1 table yields 55
+#          lookup 80 (0x50) in table 55 in stage2 yields 621
+#          record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
+#            84 = ucp_Inherited => Script inherited from predecessor
+#            12 = ucp_Mn        => Non-spacing mark
+#             3 = ucp_gbExtend  => Grapheme break property "Extend"
+#             0                 => Not part of a caseless set
+#             0                 => No other case
+#         26762 = 0x688A        => Combined Bidi class + script extension values
+#            96                 => Offset to Boolean properties
+#
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
+#           138                 => Script Extension list offset = 138
+#
+# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
+# 18, and 47 set. This means that this character is expected to be used with
+# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
+#
+#  Philip Hazel, last updated 14 January 2022.
+##############################################################################
+
+
+# Import standard modules
+
+import re
+import string
+import sys
+
+# Import common data lists and functions
+
+from GenerateCommon import \
+  bidi_classes, \
+  bool_properties, \
+  bool_propsfiles, \
+  bool_props_list_item_size, \
+  break_properties, \
+  category_names, \
+  general_category_names, \
+  script_abbrevs, \
+  script_list_item_size, \
+  script_names, \
+  open_output
+
+# Some general parameters
+
+# MAX_UNICODE is one more than the highest Unicode code point (U+10FFFF); it
+# is the length of the per-character tables built by read_table().
+# NOTACHAR is the value that the header comment above describes as the
+# terminator of each list in the ucd_caseless_sets table.
+
+MAX_UNICODE = 0x110000
+NOTACHAR = 0xffffffff
+
+
+# ---------------------------------------------------------------------------
+#                         DEFINE FUNCTIONS
+# ---------------------------------------------------------------------------
+
+
+# Make a parser for a line of Scripts.txt, GraphemeBreakProperty.txt,
+# DerivedBidiClass.txt or DerivedGeneralCategory.txt. The function returned
+# gives the position in enum of the name held in the second field of a line.
+
+def make_get_names(enum):
+  def get_name_index(chardata):
+    return enum.index(chardata[1])
+  return get_name_index
+
+
+# Parse a line of CaseFolding.txt. For a line whose status field is 'C' or 'S'
+# the result is the distance from the character to its folded form (both are
+# hexadecimal fields); for any other status it is zero.
+
+def get_other_case(chardata):
+  status = chardata[1]
+  if status != 'C' and status != 'S':
+    return 0
+  return int(chardata[2], 16) - int(chardata[0], 16)
+
+
+# Parse a line of ScriptExtensions.txt. The result is the offset of the line's
+# script list, counted in units of script_list_item_size per list held in
+# script_lists. When the list is the same text as on the previous call, the
+# offset of the most recently added list is returned again; otherwise the
+# abbreviations are converted to script numbers and added as a new list.
+
+def get_script_extension(chardata):
+  global last_script_extension
+
+  next_offset = len(script_lists) * script_list_item_size
+  extension = chardata[1]
+  if extension == last_script_extension:
+    return next_offset - script_list_item_size
+
+  last_script_extension = extension
+  numbers = tuple(script_abbrevs.index(abbrev) for abbrev in extension.split(' '))
+  script_lists.append(numbers)
+  return next_offset
+
+
+# Read a whole table into memory, setting/checking the Unicode version.
+#
+# file_name      path of the data file, of the form "<directory>/<name>.txt";
+#                  its first line must be "# <name>-<version>.txt"
+# get_value      function that is given the list of stripped, semicolon-
+#                  separated fields of a data line and returns the value for
+#                  the code points on that line
+# default_value  value for every code point that no line sets
+#
+# Returns a list of MAX_UNICODE values indexed by code point. The first file
+# read sets the global unicode_version; a later file with a different version
+# causes a warning on stderr. The file is closed even if parsing fails.
+
+def read_table(file_name, get_value, default_value):
+  global unicode_version
+
+  name_match = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
+  file_base = name_match.group(1)
+  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
+  range_re = re.compile(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$')
+
+  table = [default_value] * MAX_UNICODE
+  with open(file_name, 'r', encoding='utf-8') as file:
+    version = re.match(version_pat, file.readline()).group(1)
+    if unicode_version == "":
+      unicode_version = version
+    elif unicode_version != version:
+      # The file name must be substituted into the message with %, not passed
+      # to print() as a second argument.
+      print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)
+
+    for line in file:
+      line = re.sub(r'#.*', '', line)
+      chardata = list(map(str.strip, line.split(';')))
+      if len(chardata) <= 1:
+        continue
+      # get_value may keep state between calls, so it is called exactly once
+      # per data line, before the code point range is examined.
+      value = get_value(chardata)
+      m = range_re.match(chardata[0])
+      char = int(m.group(1), 16)
+      if m.group(3) is None:
+        last = char
+      else:
+        last = int(m.group(3), 16)
+      for i in range(char, last + 1):
+        # It is important not to overwrite a previously set value because in
+        # the CaseFolding file there are lines to be ignored (returning the
+        # default value of 0) which often come after a line which has already
+        # set data.
+        if table[i] == default_value:
+          table[i] = value
+  return table
+
+
+# Get the smallest possible C language type for the values in a table. The
+# result is a (type name, size in bytes) pair. The unsigned types are tried
+# first, in increasing size, and then the signed ones. OverflowError is raised
+# if the values do not fit into any of the types.
+
+def get_type_size(table):
+  candidates = (
+    ("uint8_t", 1, 0, 255),
+    ("uint16_t", 2, 0, 65535),
+    ("uint32_t", 4, 0, 4294967295),
+    ("signed char", 1, -128, 127),
+    ("int16_t", 2, -32768, 32767),
+    ("int32_t", 4, -2147483648, 2147483647))
+  lowest = min(table)
+  highest = max(table)
+  for type_name, type_bytes, lower, upper in candidates:
+    if lower <= lowest and highest <= upper:
+      return (type_name, type_bytes)
+  raise OverflowError("Too large to fit into C types")
+
+
+# Get the total size in bytes of a list of tables, each table being counted
+# with the smallest C type that can hold its values.
+
+def get_tables_size(*tables):
+  return sum(get_type_size(table)[1] * len(table) for table in tables)
+
+
+# Compress a table into the two stages.
+#
+# The table is cut into consecutive blocks of block_size values and each
+# distinct block is stored only once. The result is the pair (stage1, stage2):
+# stage2 is the distinct blocks laid end to end, and stage1 has one entry per
+# original block, giving the number of the stage2 block that holds its values.
+# The block numbers are integers, so floor division is needed here: in
+# Python 3 the / operator would produce floats.
+
+def compress_table(table, block_size):
+  blocks = {} # Dictionary for finding identical blocks
+  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
+  stage2 = [] # Stage 2 table contains the blocks with property values
+  table = tuple(table)
+  for i in range(0, len(table), block_size):
+    block = table[i:i+block_size]
+    start = blocks.get(block)
+    if start is None:
+      # Allocate a new block
+      start = len(stage2) // block_size
+      stage2.extend(block)
+      blocks[block] = start
+    stage1.append(start)
+  return stage1, stage2
+
+
+# Output a table as a C array definition on the output file f. Without a
+# block size the values are written 16 to a line, each line followed by a
+# comment giving a code point scaled from the line's position in the table.
+# With a block size each block is introduced by a comment giving its number
+# and is written with at most 16 values to a line.
+
+def write_table(table, table_name, block_size = None):
+  ELEMS_PER_LINE = 16
+  ctype, elem_size = get_type_size(table)
+
+  header = "const %s %s[] = { /* %d bytes" % (ctype, table_name, elem_size * len(table))
+  if block_size:
+    header += ", block = %d" % block_size
+  f.write(header + " */\n")
+
+  values = tuple(table)
+  if block_size is None:
+    line_fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
+    chars_per_entry = MAX_UNICODE / len(values)
+    for start in range(0, len(values), ELEMS_PER_LINE):
+      first_char = int(start * chars_per_entry)
+      f.write(line_fmt % (values[start:start+ELEMS_PER_LINE] + (first_char,)))
+  else:
+    per_line = min(block_size, ELEMS_PER_LINE)
+    block_fmt = "%3d," * per_line + "\n"
+    if block_size > ELEMS_PER_LINE:
+      block_fmt *= block_size // ELEMS_PER_LINE
+    block_fmt = "\n/* block %d */\n" + block_fmt
+    for start in range(0, len(values), block_size):
+      f.write(block_fmt % ((start // block_size,) + values[start:start+block_size]))
+  f.write("};\n\n")
+
+
+# Extract the unique combinations of properties into records. The tables are
+# read in parallel; each distinct tuple of values is numbered in order of
+# first appearance. The result is the list of record numbers, one for each
+# position, and the dictionary that maps each tuple to its number.
+
+def combine_tables(*tables):
+  records = {}
+  index = []
+  for row in zip(*tables):
+    index.append(records.setdefault(row, len(records)))
+  return index, records
+
+
+# Create a record struct. Each field is given the smallest C type that can
+# hold that field's values in all of the records. The result is the size of
+# the structure in bytes, including padding, together with the text of the
+# typedef (which ends by closing a C comment).
+
+def get_record_size_struct(records):
+  total = 0
+  pieces = ['typedef struct {\n']
+  for field in range(len(records[0])):
+    column = [record[field] for record in records]
+    field_type, field_size = get_type_size(column)
+    # add padding: round up to the nearest multiple of the field size, then
+    # add the field itself
+    total = ((total + field_size - 1) & -field_size) + field_size
+    pieces.append('%s property_%d;\n' % (field_type, field))
+
+  # round up to the first item of the next structure in array
+  field_type, field_size = get_type_size([record[0] for record in records])
+  total = (total + field_size - 1) & -field_size
+
+  pieces.append('} ucd_record;\n*/\n')
+  return total, ''.join(pieces)
+
+
+# Write records to the output file f as the ucd_records array. The records
+# argument maps each tuple of field values to its record number; the records
+# are written in increasing order of that number, one to a line, each followed
+# by a comment giving its position in the output.
+
+def write_records(records, record_size):
+  total_bytes = len(records) * record_size
+  f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
+    '/* %d bytes, record size %d */\n' % (total_bytes, record_size))
+  ordered = sorted(records.items(), key = lambda item: item[1])
+  for position, (fields, _) in enumerate(ordered):
+    line_fmt = '  {' + '%6d, ' * len(fields) + '}, /* %3d */\n'
+    f.write(line_fmt % (fields + (position,)))
+  f.write('};\n\n')
+
+
+# Write a list of bit sets to the output file f, followed by the closing brace
+# of the array. Each member of the list is a collection of bit numbers; it is
+# written on one line as item_size 32-bit hexadecimal words in which bit
+# number n is bit (n & 31) of word (n // 32).
+
+def write_bitsets(list, item_size):
+  for members in list:
+    words = [0] * item_size
+    for bit in members:
+      words[bit // 32] |= 1 << (bit & 31)
+    if words:
+      f.write(" " + ", ".join("0x%08xu" % word for word in words))
+    f.write(",\n")
+  f.write("};\n\n")
+
+
+# ---------------------------------------------------------------------------
+# This bit of code must have been useful when the original script was being
+# developed. Retain it just in case it is ever needed again.
+
+# def test_record_size():
+#   tests = [ \
+#     ( [(3,), (6,), (6,), (1,)], 1 ), \
+#     ( [(300,), (600,), (600,), (100,)], 2 ), \
+#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
+#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
+#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
+#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
+#   ]
+#   for test in tests:
+#     size, struct = get_record_size_struct(test[0])
+#     assert(size == test[1])
+# test_record_size()
+# ---------------------------------------------------------------------------
+
+
+
+# ---------------------------------------------------------------------------
+#                       MAIN CODE FOR CREATING TABLES
+# ---------------------------------------------------------------------------
+
+unicode_version = ""
+
+# Some of the tables imported from GenerateCommon.py have alternate comment
+# strings for use by GenerateUcpHeader. The comments are not wanted here, so
+# remove them.
+
+bidi_classes = bidi_classes[::2]
+break_properties = break_properties[::2]
+category_names = category_names[::2]
+
+# Create the various tables from Unicode data files
+
+script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
+category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
+break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
+other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
+bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
+
+# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
+# we need to find the Extended_Pictographic property for emoji characters. This
+# can be set as an additional grapheme break property, because the default for
+# all the emojis is "other". We scan the emoji-data.txt file and modify the
+# break-props table.
+
+with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
+  for line in file:
+    line = re.sub(r'#.*', '', line)
+    chardata = list(map(str.strip, line.split(';')))
+    if len(chardata) <= 1:
+      continue
+    if chardata[1] != "Extended_Pictographic":
+      continue
+    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
+    char = int(m.group(1), 16)
+    if m.group(3) is None:
+      last = char
+    else:
+      last = int(m.group(3), 16)
+    for i in range(char, last + 1):
+      if break_props[i] != break_properties.index('Other'):
+        # print() does not interpolate, so the message is %-formatted first.
+        print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
+          (i, break_properties[break_props[i]]), file=sys.stderr)
+      break_props[i] = break_properties.index('Extended_Pictographic')
+
+# Handle script extensions. The get_script_extension() function maintains a
+# list of unique bitmaps representing lists of scripts, returning the offset
+# in that list. Initialize the list with an empty set, which is used for
+# characters that have no script extensions.
+
+script_lists = [[]]
+last_script_extension = ""
+scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
+# Merge each character's bidi class into the same value, above the low 11 bits.
+for idx in range(len(scriptx_bidi_class)):
+  scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
+bidi_class = None  # the bidi classes now live only in scriptx_bidi_class
+
+# Find the Boolean properties of each character. This next bit of magic creates
+# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
+# the *same* list, which is not what we want.
+
+bprops = [[] for _ in range(MAX_UNICODE)]
+
+# Collect the properties from the various files
+
+for filename in bool_propsfiles:
+  try:
+    file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
+  except IOError:
+    print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
+    sys.exit(1)
+
+  # The file is opened as UTF-8 like the other data files; "with" ensures the close.
+  with file:
+    for line in file:
+      line = re.sub(r'#.*', '', line)
+      data = list(map(str.strip, line.split(';')))
+      if len(data) <= 1:
+        continue
+
+      try:
+        ix = bool_properties.index(data[1])
+      except ValueError:
+        continue
+
+      m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
+      char = int(m.group(1), 16)
+      if m.group(3) is None:
+        last = char
+      else:
+        last = int(m.group(3), 16)
+
+      for i in range(char, last + 1):
+        bprops[i].append(ix)
+
+# The ASCII property isn't listed in any files, but it is easy enough to add
+# it manually.
+
+ix = bool_properties.index("ASCII")
+for i in range(128):
+  bprops[i].append(ix)
+
+# The Bidi_Mirrored property isn't listed in any property files. We have to
+# deduce it from the file that lists the mirrored characters.
+
+ix = bool_properties.index("Bidi_Mirrored")
+
+try:
+  file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
+except IOError:
+  print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
+  sys.exit(1)
+
+# The file is opened as UTF-8 like the other data files; "with" ensures the close.
+with file:
+  for line in file:
+    line = re.sub(r'#.*', '', line)
+    data = list(map(str.strip, line.split(';')))
+    if len(data) <= 1:
+      continue
+    c = int(data[0], 16)
+    bprops[c].append(ix)
+
+# Scan each character's boolean property list and create a list of unique
+# lists, at the same time setting, for each character, the offset of its list
+# in the bool_props vector.
+
+bool_props = [0] * MAX_UNICODE
+bool_props_lists = [[]]
+bool_props_index = {frozenset(): 0}   # property set -> position in bool_props_lists
+
+for c in range(MAX_UNICODE):
+  key = frozenset(bprops[c])
+  i = bool_props_index.get(key)
+  if i is None:
+    # First sighting of this combination: it takes the next free position.
+    i = bool_props_index[key] = len(bool_props_lists)
+    bool_props_lists.append(bprops[c])
+
+  bool_props[c] = i * bool_props_list_item_size
+
+# This block of code was added by PH in September 2012. It scans the other_case
+# table to find sets of more than two characters that must all match each other
+# caselessly. Later in this script a table of these sets is written out.
+# However, we have to do this work here in order to compute the offsets in the
+# table that are inserted into the main table.
+
+# The CaseFolding.txt file lists pairs, but the common logic for reading data
+# sets only one value, so first we go through the table and set "return"
+# offsets for those that are not already set.
+
+for c in range(MAX_UNICODE):
+  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
+    other_case[c + other_case[c]] = -other_case[c]
+
+# Now scan again and create equivalence sets.
+
+caseless_sets = []
+
+for c in range(MAX_UNICODE):
+  o = c + other_case[c]
+
+  # Trigger when this character's other case does not point back here. We
+  # now have three characters that are case-equivalent: c, its other case o,
+  # and the character t that o points to.
+
+  if other_case[o] != -other_case[c]:
+    t = o + other_case[o]
+
+    # Scan the existing sets to see if any of the three characters are already
+    # part of a set. If so, unite the existing set with the new set.
+    # Every set is checked, not just the first match, so the characters are
+    # added to each set that already holds one of them.
+
+    appended = 0
+    for s in caseless_sets:
+      found = c in s or o in s or t in s
+
+      # Add new characters to an existing set. Membership is tested
+      # separately for each of the three characters. A single flag shared
+      # between them would, once any one character had been found, stop a
+      # later character that is missing from the set from being added, and
+      # the set would then be left incomplete.
+
+      if found:
+        for y in [c, o, t]:
+          if y not in s:
+            s.append(y)
+        appended = 1
+
+    # If we have not added to an existing set, create a new one.
+
+    if not appended:
+      caseless_sets.append([c, o, t])
+
+# End of loop looking for caseless sets.
+
+# Now scan the sets and set appropriate offsets for the characters.
+
+caseless_offsets = [0] * MAX_UNICODE
+
+offset = 1  # position 0 of ucd_caseless_sets holds a lone NOTACHAR
+for s in caseless_sets:
+  for x in s:
+    caseless_offsets[x] = offset
+  offset += len(s) + 1  # step over the set's members and its NOTACHAR terminator
+
+# End of block of code for creating offsets for caseless matching sets.
+
+
+# Combine all the tables
+
+table, records = combine_tables(script, category, break_props,
+  caseless_offsets, other_case, scriptx_bidi_class, bool_props)
+
+# Find the record size and create a string definition of the structure for
+# outputting as a comment.
+
+record_size, record_struct = get_record_size_struct(list(records.keys()))
+
+# Find the optimum block size for the two-stage table
+
+min_size = sys.maxsize
+for block_size in [2 ** i for i in range(5,10)]:  # candidates: 32, 64, 128, 256, 512
+  size = len(records) * record_size  # the record table costs the same for every candidate
+  stage1, stage2 = compress_table(table, block_size)
+  size += get_tables_size(stage1, stage2)
+  #print("/* block size %5d  => %5d bytes */" % (block_size, size))
+  if size < min_size:  # strict "<": on a tie the smaller block size is kept
+    min_size = size
+    min_stage1, min_stage2 = stage1, stage2
+    min_block_size = block_size
+
+
+# ---------------------------------------------------------------------------
+#                   MAIN CODE FOR WRITING THE OUTPUT FILE
+# ---------------------------------------------------------------------------
+
+# Open the output file (no return on failure). This call also writes standard
+# header boilerplate.
+
+f = open_output("pcre2_ucd.c")
+
+# Output this file's heading text
+
+f.write("""\
+/* This file contains tables of Unicode properties that are extracted from
+Unicode data files. See the comments at the start of maint/GenerateUcd.py for
+details.
+
+As well as being part of the PCRE2 library, this file is #included by the
+pcre2test program, which redefines the PRIV macro to change table names from
+_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
+just one of these tables is actually needed. When compiling the library, some
+headers are needed. */
+
+#ifndef PCRE2_PCRE2TEST
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "pcre2_internal.h"
+#endif /* PCRE2_PCRE2TEST */
+
+/* The tables herein are needed only when UCP support is built, and in PCRE2
+that happens automatically with UTF support. This module should not be
+referenced otherwise, so it should not matter whether it is compiled or not.
+However a comment was received about space saving - maybe the guy linked all
+the modules rather than using a library - so we include a condition to cut out
+the tables when not needed. But don't leave a totally empty module because some
+compilers barf at that. Instead, just supply some small dummy tables. */
+
+#ifndef SUPPORT_UNICODE
+const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
+const uint16_t PRIV(ucd_stage1)[] = {0};
+const uint16_t PRIV(ucd_stage2)[] = {0};
+const uint32_t PRIV(ucd_caseless_sets)[] = {0};
+#else
+\n""")
+
+# --- Output some variable heading stuff ---
+
+f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
+f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
+
+f.write("""\
+/* When recompiling tables with a new Unicode version, please check the types
+in this structure definition with those in pcre2_internal.h (the actual field
+names will be different).
+\n""")
+
+f.write(record_struct)
+
+f.write("""
+/* If the 32-bit library is run in non-32-bit mode, character values greater
+than 0x10ffff may be encountered. For these we set up a special record. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 32
+const ucd_record PRIV(dummy_ucd_record)[] = {{
+  ucp_Unknown,    /* script */
+  ucp_Cn,         /* type unassigned */
+  ucp_gbOther,    /* grapheme break property */
+  0,              /* case set */
+  0,              /* other case */
+  0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
+  0,              /* bool properties offset */
+  }};
+#endif
+\n""")
+
+# --- Output the table of caseless character sets ---
+
+f.write("""\
+/* This table contains lists of characters that are caseless sets of
+more than one character. Each list is terminated by NOTACHAR. */
+
+const uint32_t PRIV(ucd_caseless_sets)[] = {
+  NOTACHAR,
+""")
+
+for s in caseless_sets:
+  s = sorted(s)
+  for x in s:
+    f.write('  0x%04x,' % x)
+  f.write('  NOTACHAR,\n')
+f.write('};\n\n')
+
+# --- Other tables are not needed by pcre2test ---
+
+f.write("""\
+/* When #included in pcre2test, we don't need the table of digit sets, nor the
+the large main UCD tables. */
+
+#ifndef PCRE2_PCRE2TEST
+\n""")
+
+# --- Read Scripts.txt again for the sets of 10 digits. ---
+
+digitsets = []
+
+with open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') as file:
+  for line in file:
+    m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
+    if m is None:
+      continue
+    first = int(m.group(1),16)
+    last  = int(m.group(2),16)
+    if ((last - first + 1) % 10) != 0:
+      # Goes to stderr via print(): f is the generated C file and write() has no file=.
+      print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
+        file=sys.stderr)
+    while first < last:
+      digitsets.append(first + 9)
+      first += 10
+digitsets.sort()
+
+f.write("""\
+/* This table lists the code points for the '9' characters in each set of
+decimal digits. It is used to ensure that all the digits in a script run come
+from the same set. */
+
+const uint32_t PRIV(ucd_digit_sets)[] = {
+""")
+
+f.write("  %d,  /* Number of subsequent values */" % len(digitsets))
+count = 8
+for d in digitsets:
+  if count == 8:
+    f.write("\n ")
+    count = 0
+  f.write(" 0x%05x," % d)
+  count += 1
+f.write("\n};\n\n")
+
+f.write("""\
+/* This vector is a list of script bitsets for the Script Extension property.
+The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
+ucd_script_sets_item_size. */
+
+const uint32_t PRIV(ucd_script_sets)[] = {
+""")
+write_bitsets(script_lists, script_list_item_size)
+
+f.write("""\
+/* This vector is a list of bitsets for Boolean properties. The number of
+32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
+pcre2_ucp.h. */
+
+const uint32_t PRIV(ucd_boolprop_sets)[] = {
+""")
+write_bitsets(bool_props_lists, bool_props_list_item_size)
+
+
+# Output the main UCD tables.
+
+f.write("""\
+/* These are the main two-stage UCD tables. The fields in each record are:
+script (8 bits), character type (8 bits), grapheme break property (8 bits),
+offset to multichar other cases or zero (8 bits), offset to other case or zero
+(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
+into a 16-bit field, and offset in binary properties table (16 bits). */
+\n""")
+
+write_records(records, record_size)
+write_table(min_stage1, 'PRIV(ucd_stage1)')
+write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
+
+f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
+f.write("""\
+#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
+#endif
+#endif  /* SUPPORT_UNICODE */
+
+#endif  /* PCRE2_PCRE2TEST */
+
+/* End of pcre2_ucd.c */
+""")
+
+f.close()  # call the method so that pcre2_ucd.c is flushed and closed
+
+# End
--- a/maint/GenerateUcpHeader.py
+++ b/maint/GenerateUcpHeader.py
@ -0,0 +1,98 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This script generates the pcre2_ucp.h file from Unicode data files. This
+# header uses enumerations to give names to Unicode property types and script
+# names.
+
+# This script was created in December 2021 as part of the Unicode data
+# generation refactoring.
+
+
+# Import common data lists and functions
+
+from GenerateCommon import \
+  bidi_classes, \
+  bool_properties, \
+  bool_props_list_item_size, \
+  break_properties, \
+  category_names, \
+  general_category_names, \
+  script_list_item_size, \
+  script_names, \
+  open_output
+
+# Open the output file (no return on failure). This call also writes standard
+# header boilerplate.
+
+f = open_output("pcre2_ucp.h")
+
+# Output this file's heading text
+
+f.write("""\
+#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
+#define PCRE2_UCP_H_IDEMPOTENT_GUARD
+
+/* This file contains definitions of the Unicode property values that are
+returned by the UCD access macros and used throughout PCRE2.
+
+IMPORTANT: The specific values of the first two enums (general and particular
+character categories) are assumed by the table called catposstab in the file
+pcre2_auto_possess.c. They are unlikely to change, but should be checked after
+an update. */
+\n""")
+
+f.write("/* These are the general character categories. */\n\nenum {\n")
+for i in general_category_names:
+  f.write("  ucp_%s,\n" % i)
+f.write("};\n\n")
+
+f.write("/* These are the particular character categories. */\n\nenum {\n")
+for i in range(0, len(category_names), 2):
+  f.write("  ucp_%s,    /* %s */\n" % (category_names[i], category_names[i+1]))
+f.write("};\n\n")
+
+f.write("/* These are Boolean properties. */\n\nenum {\n")
+for i in bool_properties:
+  f.write("  ucp_%s,\n" % i)
+
+f.write("  /* This must be last */\n")
+f.write("  ucp_Bprop_Count\n};\n\n")
+
+f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
+f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
+
+f.write("/* These are the bidi class values. */\n\nenum {\n")
+for i in range(0, len(bidi_classes), 2):
+  sp = ' ' * (4 - len(bidi_classes[i]))
+  f.write("  ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
+f.write("};\n\n")
+
+f.write("/* These are grapheme break properties. The Extended Pictographic "
+  "property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
+for i in range(0, len(break_properties), 2):
+  sp = ' ' * (21 - len(break_properties[i]))
+  f.write("  ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
+f.write("};\n\n")
+
+f.write("/* These are the script identifications. */\n\nenum {\n  /* Scripts which has characters in other scripts. */\n")
+for i in script_names:
+  if i == "Unknown":
+    f.write("\n  /* Scripts which has no characters in other scripts. */\n")
+  f.write("  ucp_%s,\n" % i)
+f.write("\n")
+
+f.write("  /* This must be last */\n")
+f.write("  ucp_Script_Count\n};\n\n")
+
+f.write("/* Size of entries in ucd_script_sets[] */\n\n")
+f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
+
+f.write("#endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
+f.write("/* End of pcre2_ucp.h */\n")
+
+f.close()
+
+# End
--- a/maint/GenerateUcpTables.py
+++ b/maint/GenerateUcpTables.py
@ -0,0 +1,203 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This script generates the pcre2_ucptables.c file, which contains tables for
+# recognizing Unicode property names. It is #included by pcre2_tables.c. In
+# order to reduce the number of relocations when loading the PCRE2 library, the
+# names are held as a single large string, with offsets in the table. This is
+# tedious to maintain by hand. Therefore, a script is used to generate the
+# table.
+
+# This script was created in December 2021 based on the previous GenerateUtt
+# script, whose output had to be manually edited into pcre2_tables.c. Here is
+# the history of the original script:
+
+# -----------------------------------------------------------------------------
+# Modified by PH 17-March-2009 to generate the more verbose form that works
+# for UTF-support in EBCDIC as well as ASCII environments.
+# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
+# Modified by PH 04-May-2010 to add new "X.." special categories.
+# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
+# Modified by ChPe 30-September-2012 to add this note; no other changes were
+# necessary for Unicode 6.2.0 support.
+# Modified by PH 26-February-2013 to add the Xuc special category.
+# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
+# Script updated to Python 3 by running it through the 2to3 converter.
+# Added script names for Unicode 7.0.0, 20-June-2014.
+# Added script names for Unicode 8.0.0, 19-June-2015.
+# Added script names for Unicode 10.0.0, 02-July-2017.
+# Added script names for Unicode 11.0.0, 03-July-2018.
+# Added 'Unknown' script, 01-October-2018.
+# Added script names for Unicode 12.1.0, 27-July-2019.
+# Added script names for Unicode 13.0.0, 10-March-2020.
+# Added Script names for Unicode 14.0.0, PCRE2-10.39
+# Added support for bidi class and bidi control, 06-December-2021
+#   This also involved lower casing strings and removing underscores, in
+#   accordance with Unicode's "loose matching" rules, which Perl observes.
+# Changed default script type from PT_SC to PT_SCX, 18-December-2021
+# -----------------------------------------------------------------------------
+#
+# Note subsequent changes here:
+#
+# 27-December-2021: Added support for 4-letter script abbreviations.
+# 10-January-2022:  Further updates for Boolean property support
+# -----------------------------------------------------------------------------
+
+
+# Import common data lists and functions
+
+from GenerateCommon import \
+  abbreviations, \
+  bool_properties, \
+  bidi_classes, \
+  category_names, \
+  general_category_names, \
+  script_names, \
+  open_output
+
+# Open the output file (no return on failure). This call also writes standard
+# header boilerplate.
+
+f = open_output("pcre2_ucptables.c")
+
+# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
+# etc., along with comments. We need to add "bidi" in front of each value, in
+# order to create names that don't clash with other types of property.
+
+bidi_class_names = []
+for i in range(0, len(bidi_classes), 2):
+  bidi_class_names.append("bidi" + bidi_classes[i])
+
+# Remove the comments from other lists that contain them.
+
+category_names = category_names[::2]
+
+# Create standardized versions of the names by lowercasing and removing
+# underscores.
+
+def stdname(x):
+  # Loose-matching form of one name: lower case with every underscore removed.
+  return ''.join(x.lower().split('_'))
+
+def stdnames(x):
+  # Standardized copy of a list of names, one result per input name and in
+  # the same order. The list passed in is left unchanged.
+  return [stdname(name) for name in x]
+
+std_category_names = stdnames(category_names)
+std_general_category_names = stdnames(general_category_names)
+std_bidi_class_names = stdnames(bidi_class_names)
+std_bool_properties = stdnames(bool_properties)
+
+# Create the table, starting with the Unicode script, category and bidi class
+# names. We keep both the standardized name and the original, because the
+# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
+# still use the full original names.
+
+utt_table = []
+
+scx_end = script_names.index('Unknown')
+
+for idx, name in enumerate(script_names):
+  pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
+  utt_table.append((stdname(name), name, pt_type))
+  for abbrev in abbreviations[name]:
+    utt_table.append((stdname(abbrev), name, pt_type))
+
+# Add the remaining property lists
+
+utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
+utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
+utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
+
+for name in bool_properties:
+  utt_table.append((stdname(name), name, 'PT_BOOL'))
+  if name in abbreviations: 
+    for abbrev in abbreviations[name]:
+      utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
+
+# Now add specials and synonyms. Note both the standardized and capitalized
+# forms are needed.
+
+utt_table.append(('any', 'Any', 'PT_ANY'))
+utt_table.append(('l&',  'L&',  'PT_LAMP'))
+utt_table.append(('lc',  'LC',  'PT_LAMP'))
+utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
+utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
+utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
+utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
+utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
+
+# Remove duplicates from the table and then sort it.
+
+utt_table = list(set(utt_table)) 
+utt_table.sort()
+
+# Output file-specific heading
+
+f.write("""\
+#ifdef SUPPORT_UNICODE
+
+/* The PRIV(utt)[] table below translates Unicode property names into type and
+code values. It is searched by binary chop, so must be in collating sequence of
+name. Originally, the table contained pointers to the name strings in the first
+field of each entry. However, that leads to a large number of relocations when
+a shared library is dynamically loaded. A significant reduction is made by
+putting all the names into a single, large string and using offsets instead.
+All letters are lower cased, and underscores are removed, in accordance with
+the "loose matching" rules that Unicode advises and Perl uses. */
+\n""")
+
+# We have to use STR_ macros to define the strings so that it all works in
+# UTF-8 mode on EBCDIC platforms.
+
+for utt in utt_table:
+  f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))
+  for c in utt[0]:
+    if c == '&':
+      f.write(' STR_AMPERSAND')
+    else:
+      f.write(' STR_%s' % c);
+  f.write(' "\\0"\n')
+
+# Output the long string of concatenated names
+
+f.write('\nconst char PRIV(utt_names)[] =\n');
+last = ''
+for utt in utt_table:
+  if utt == utt_table[-1]:
+    last = ';'
+  f.write('  STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
+
+# Output the property type table
+
+f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
+offset = 0
+last = ','
+for utt in utt_table:
+  if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
+      'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
+    value = '0'
+  else:
+    value = 'ucp_' + utt[1]
+  if utt == utt_table[-1]:
+    last = ''
+  f.write('  { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
+  offset += len(utt[0]) + 1
+f.write('};\n\n')
+
+# Ending text
+
+f.write("""\
+const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
+
+#endif /* SUPPORT_UNICODE */
+
+/* End of pcre2_ucptables.c */
+""")
+
+f.close()  # call the method so that pcre2_ucptables.c is flushed and closed
+
+# End
--- a/maint/GenerateUtt.py
+++ b/maint/GenerateUtt.py
@ -1,140 +0,0 @@
-#! /usr/bin/python
-
-# Generate utt tables. Note: this script has now been converted to Python 3.
-
-# The source file pcre2_tables.c contains (amongst other things), a table that
-# is indexed by script name. In order to reduce the number of relocations when
-# loading the library, the names are held as a single large string, with
-# offsets in the table. This is tedious to maintain by hand. Therefore, this
-# script is used to generate the table. The output is sent to stdout; usually
-# that should be directed to a temporary file. Then pcre2_tables.c can be
-# edited by replacing the relevant definitions and table therein with the
-# temporary file.
-
-# Modified by PH 17-March-2009 to generate the more verbose form that works
-# for UTF-support in EBCDIC as well as ASCII environments.
-# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
-# Modified by PH 04-May-2010 to add new "X.." special categories.
-# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
-# Modified by ChPe 30-September-2012 to add this note; no other changes were
-# necessary for Unicode 6.2.0 support.
-# Modfied by PH 26-February-2013 to add the Xuc special category.
-# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
-# Script updated to Python 3 by running it through the 2to3 converter.
-# Added script names for Unicode 7.0.0, 20-June-2014.
-# Added script names for Unicode 8.0.0, 19-June-2015.
-# Added script names for Unicode 10.0.0, 02-July-2017.
-# Added script names for Unicode 11.0.0, 03-July-2018.
-# Added 'Unknown' script, 01-October-2018.
-# Added script names for Unicode 12.1.0, 27-July-2019.
-# Added script names for Unicode 13.0.0, 10-March-2020.
-# Added Script names for Unicode 14.0.0, PCRE2-10.39
-
-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
- # New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
- # New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
- # New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
- # New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic', \
-# New for Unicode 6.1.0
- 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
-# New for Unicode 7.0.0
- 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
- 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
- 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
- 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
-# New for Unicode 8.0.0
- 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
- 'SignWriting',
-# New for Unicode 10.0.0
- 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
- 'Nushu', 'Soyombo', 'Zanabazar_Square',
-# New for Unicode 11.0.0
-  'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
-  'Old_Sogdian', 'Sogdian',
-# New for Unicode 12.0.0
-  'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
-# New for Unicode 13.0.0
-  'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
-# New for Unicode 14.0.0
-  'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
- ]
-
-category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
-  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
-  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
-
-general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
-
-# First add the Unicode script and category names.
-
-utt_table  = list(zip(script_names, ['PT_SC'] * len(script_names)))
-utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
-utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
-
-# Now add our own specials.
-
-utt_table.append(('Any', 'PT_ANY'))
-utt_table.append(('L&',  'PT_LAMP'))
-utt_table.append(('Xan', 'PT_ALNUM'))
-utt_table.append(('Xps', 'PT_PXSPACE'))
-utt_table.append(('Xsp', 'PT_SPACE'))
-utt_table.append(('Xuc', 'PT_UCNC'))
-utt_table.append(('Xwd', 'PT_WORD'))
-
-# Sort the table.
-
-utt_table.sort()
-
-# We have to use STR_ macros to define the strings so that it all works in
-# UTF-8 mode on EBCDIC platforms.
-
-for utt in utt_table:
-        print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
-        for c in utt[0]:
-                if c == '_':
-                        print('STR_UNDERSCORE', end=' ')
-                elif c == '&':
-                        print('STR_AMPERSAND', end=' ')
-                else:
-                        print('STR_%s' % c, end=' ');
-        print('"\\0"')
-
-# Print the actual table, using the string names
-
-print('')
-print('const char PRIV(utt_names)[] =');
-last = ''
-for utt in utt_table:
-        if utt == utt_table[-1]:
-                last = ';'
-        print('  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
-# This was how it was done before the EBCDIC-compatible modification.
-#        print '  "%s\\0"%s' % (utt[0], last)
-
-print('\nconst ucp_type_table PRIV(utt)[] = {')
-offset = 0
-last = ','
-for utt in utt_table:
-        if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', 
-          'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
-                value = '0'
-        else:
-                value = 'ucp_' + utt[0]
-        if utt == utt_table[-1]:
-                last = ''
-        print('  { %3d, %s, %s }%s' % (offset, utt[1], value, last))
-        offset += len(utt[0]) + 1
-print('};')
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -1,819 +0,0 @@
-#! /usr/bin/python
-
-# Multistage table builder
-# (c) Peter Kankowski, 2008
-
-##############################################################################
-# This script was submitted to the PCRE project by Peter Kankowski as part of
-# the upgrading of Unicode property support. The new code speeds up property
-# matching many times. The script is for the use of PCRE maintainers, to
-# generate the pcre2_ucd.c file that contains a digested form of the Unicode
-# data tables. A number of extensions have been added to the original script.
-#
-# The script has now been upgraded to Python 3 for PCRE2, and should be run in
-# the maint subdirectory, using the command
-#
-# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
-#
-# It requires six Unicode data tables: DerivedGeneralCategory.txt,
-# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt,
-# CaseFolding.txt, and emoji-data.txt. These must be in the
-# maint/Unicode.tables subdirectory.
-#
-# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the
-# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is
-# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and
-# CaseFolding.txt are directly in the UCD directory.
-#
-# The emoji-data.txt file is found in the "emoji" subdirectory even though it
-# is technically part of a different (but coordinated) standard as shown
-# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
-# for example:
-#
-# http://unicode.org/Public/emoji/13.0/ReadMe.txt
-#
-# -----------------------------------------------------------------------------
-# Minor modifications made to this script:
-#  Added #! line at start
-#  Removed tabs
-#  Made it work with Python 2.4 by rewriting two statements that needed 2.5
-#  Consequent code tidy
-#  Adjusted data file names to take from the Unicode.tables directory
-#  Adjusted global table names by prefixing _pcre_.
-#  Commented out stuff relating to the casefolding table, which isn't used;
-#    removed completely in 2012.
-#  Corrected size calculation
-#  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
-#  Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
-#
-# Major modifications made to this script:
-#  Added code to add a grapheme break property field to records.
-#
-#  Added code to search for sets of more than two characters that must match
-#  each other caselessly. A new table is output containing these sets, and
-#  offsets into the table are added to the main output records. This new
-#  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
-#  used.
-#
-#  Update for Python3:
-#    . Processed with 2to3, but that didn't fix everything
-#    . Changed string.strip to str.strip
-#    . Added encoding='utf-8' to the open() call
-#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
-#        required and the result of the division is a float
-#
-#  Added code to scan the emoji-data.txt file to find the Extended Pictographic
-#  property, which is used by PCRE2 as a grapheme breaking property. This was
-#  done when updating to Unicode 11.0.0 (July 2018).
-#
-#  Added code to add a Script Extensions field to records. This has increased
-#  their size from 8 to 12 bytes, only 10 of which are currently used.
-#
-# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
-# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
-#     July-2012:     Updated list of scripts for Unicode 6.1.0
-# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
-#                      field in the record to hold the value. Luckily, the
-#                      structure had a hole in it, so the resulting table is
-#                      not much bigger than before.
-# 18-September-2012: Added code for multiple caseless sets. This uses the
-#                      final hole in the structure.
-# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
-# 13-May-2014:       Updated for PCRE2
-# 03-June-2014:      Updated for Python 3
-# 20-June-2014:      Updated for Unicode 7.0.0
-# 12-August-2014:    Updated to put Unicode version into the file
-# 19-June-2015:      Updated for Unicode 8.0.0
-# 02-July-2017:      Updated for Unicode 10.0.0
-# 03-July-2018:      Updated for Unicode 11.0.0
-# 07-July-2018:      Added code to scan emoji-data.txt for the Extended
-#                      Pictographic property.
-# 01-October-2018:   Added the 'Unknown' script name
-# 03-October-2018:   Added new field for Script Extensions
-# 27-July-2019:      Updated for Unicode 12.1.0
-# 10-March-2020:     Updated for Unicode 13.0.0
-# PCRE2-10.39:       Updated for Unicode 14.0.0
-# ----------------------------------------------------------------------------
-#
-#
-# The main tables generated by this script are used by macros defined in
-# pcre2_internal.h. They look up Unicode character properties using short
-# sequences of code that contains no branches, which makes for greater speed.
-#
-# Conceptually, there is a table of records (of type ucd_record), containing a
-# script number, script extension value, character type, grapheme break type,
-# offset to caseless matching set, offset to the character's other case, for
-# every Unicode character. However, a real table covering all Unicode
-# characters would be far too big. It can be efficiently compressed by
-# observing that many characters have the same record, and many blocks of
-# characters (taking 128 characters in a block) have the same set of records as
-# other blocks. This leads to a 2-stage lookup process.
-#
-# This script constructs six tables. The ucd_caseless_sets table contains
-# lists of characters that all match each other caselessly. Each list is
-# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
-# any valid character. The first list is empty; this is used for characters
-# that are not part of any list.
-#
-# The ucd_digit_sets table contains the code points of the '9' characters in
-# each set of 10 decimal digits in Unicode. This is used to ensure that digits
-# in script runs all come from the same set. The first element in the vector
-# contains the number of subsequent elements, which are in ascending order.
-#
-# The ucd_script_sets vector contains lists of script numbers that are the
-# Script Extensions properties of certain characters. Each list is terminated
-# by zero (ucp_Unknown). A character with more than one script listed for its
-# Script Extension property has a negative value in its record. This is the
-# negated offset to the start of the relevant list in the ucd_script_sets
-# vector.
-#
-# The ucd_records table contains one instance of every unique record that is
-# required. The ucd_stage1 table is indexed by a character's block number,
-# which is the character's code point divided by 128, since 128 is the size
-# of each block. The result of a lookup in ucd_stage1 a "virtual" block number.
-#
-# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
-# the offset of a character within its own block, and the result is the index
-# number of the required record in the ucd_records vector.
-#
-# The following examples are correct for the Unicode 11.0.0 database. Future
-# updates may make change the actual lookup values.
-#
-# Example: lowercase "a" (U+0061) is in block 0
-#          lookup 0 in stage1 table yields 0
-#          lookup 97 (0x61) in the first table in stage2 yields 17
-#          record 17 is { 34, 5, 12, 0, -32, 34, 0 }
-#            34 = ucp_Latin   => Latin script
-#             5 = ucp_Ll      => Lower case letter
-#            12 = ucp_gbOther => Grapheme break property "Other"
-#             0               => Not part of a caseless set
-#           -32 (-0x20)       => Other case is U+0041
-#            34 = ucp_Latin   => No special Script Extension property
-#             0               => Dummy value, unused at present
-#
-# Almost all lowercase latin characters resolve to the same record. One or two
-# are different because they are part of a multi-character caseless set (for
-# example, k, K and the Kelvin symbol are such a set).
-#
-# Example: hiragana letter A (U+3042) is in block 96 (0x60)
-#          lookup 96 in stage1 table yields 90
-#          lookup 66 (0x42) in table 90 in stage2 yields 564
-#          record 564 is { 27, 7, 12, 0, 0, 27, 0 }
-#            27 = ucp_Hiragana => Hiragana script
-#             7 = ucp_Lo       => Other letter
-#            12 = ucp_gbOther  => Grapheme break property "Other"
-#             0                => Not part of a caseless set
-#             0                => No other case
-#            27 = ucp_Hiragana => No special Script Extension property
-#             0                => Dummy value, unused at present
-#
-# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
-#          lookup 57 in stage1 table yields 55
-#          lookup 80 (0x50) in table 55 in stage2 yields 458
-#          record 458 is { 28, 12, 3, 0, 0, -101, 0 }
-#            28 = ucp_Inherited => Script inherited from predecessor
-#            12 = ucp_Mn        => Non-spacing mark
-#             3 = ucp_gbExtend  => Grapheme break property "Extend"
-#             0                 => Not part of a caseless set
-#             0                 => No other case
-#          -101                 => Script Extension list offset = 101
-#             0                 => Dummy value, unused at present
-#
-# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29,
-# and terminator 0. This means that this character is expected to be used with
-# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
-#
-#  Philip Hazel, 03 July 2008
-##############################################################################
-
-
-import re
-import string
-import sys
-
-MAX_UNICODE = 0x110000
-NOTACHAR = 0xffffffff
-
-
-# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
-def make_get_names(enum):
-        return lambda chardata: enum.index(chardata[1])
-
-# Parse a line of CaseFolding.txt
-def get_other_case(chardata):
-        if chardata[1] == 'C' or chardata[1] == 'S':
-          return int(chardata[2], 16) - int(chardata[0], 16)
-        return 0
-
-# Parse a line of ScriptExtensions.txt
-def get_script_extension(chardata):
-        this_script_list = list(chardata[1].split(' '))
-        if len(this_script_list) == 1:
-          return script_abbrevs.index(this_script_list[0])
-
-        script_numbers = []
-        for d in this_script_list:
-          script_numbers.append(script_abbrevs.index(d))
-        script_numbers.append(0)
-        script_numbers_length = len(script_numbers)
-
-        for i in range(1, len(script_lists) - script_numbers_length + 1):
-          for j in range(0, script_numbers_length):
-            found = True
-            if script_lists[i+j] != script_numbers[j]:
-              found = False
-              break
-          if found:
-            return -i
-
-        # Not found in existing lists
-
-        return_value = len(script_lists)
-        script_lists.extend(script_numbers)
-        return -return_value
-
-# Read the whole table in memory, setting/checking the Unicode version
-def read_table(file_name, get_value, default_value):
-        global unicode_version
-
-        f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
-        file_base = f.group(1)
-        version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
-        file = open(file_name, 'r', encoding='utf-8')
-        f = re.match(version_pat, file.readline())
-        version = f.group(1)
-        if unicode_version == "":
-                unicode_version = version
-        elif unicode_version != version:
-                print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
-
-        table = [default_value] * MAX_UNICODE
-        for line in file:
-                line = re.sub(r'#.*', '', line)
-                chardata = list(map(str.strip, line.split(';')))
-                if len(chardata) <= 1:
-                        continue
-                value = get_value(chardata)
-                m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-                char = int(m.group(1), 16)
-                if m.group(3) is None:
-                        last = char
-                else:
-                        last = int(m.group(3), 16)
-                for i in range(char, last + 1):
-                        # It is important not to overwrite a previously set
-                        # value because in the CaseFolding file there are lines
-                        # to be ignored (returning the default value of 0)
-                        # which often come after a line which has already set
-                        # data.
-                        if table[i] == default_value:
-                          table[i] = value
-        file.close()
-        return table
-
-# Get the smallest possible C language type for the values
-def get_type_size(table):
-        type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
-                                 ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
-        limits = [(0, 255), (0, 65535), (0, 4294967295),
-                          (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
-        minval = min(table)
-        maxval = max(table)
-        for num, (minlimit, maxlimit) in enumerate(limits):
-                if minlimit <= minval and maxval <= maxlimit:
-                        return type_size[num]
-        else:
-                raise OverflowError("Too large to fit into C types")
-
-def get_tables_size(*tables):
-        total_size = 0
-        for table in tables:
-                type, size = get_type_size(table)
-                total_size += size * len(table)
-        return total_size
-
-# Compress the table into the two stages
-def compress_table(table, block_size):
-        blocks = {} # Dictionary for finding identical blocks
-        stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
-        stage2 = [] # Stage 2 table contains the blocks with property values
-        table = tuple(table)
-        for i in range(0, len(table), block_size):
-                block = table[i:i+block_size]
-                start = blocks.get(block)
-                if start is None:
-                        # Allocate a new block
-                        start = len(stage2) / block_size
-                        stage2 += block
-                        blocks[block] = start
-                stage1.append(start)
-
-        return stage1, stage2
-
-# Print a table
-def print_table(table, table_name, block_size = None):
-        type, size = get_type_size(table)
-        ELEMS_PER_LINE = 16
-
-        s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
-        if block_size:
-                s += ", block = %d" % block_size
-        print(s + " */")
-        table = tuple(table)
-        if block_size is None:
-                fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
-                mult = MAX_UNICODE / len(table)
-                for i in range(0, len(table), ELEMS_PER_LINE):
-                        print(fmt % (table[i:i+ELEMS_PER_LINE] +
-                          (int(i * mult),)))
-        else:
-                if block_size > ELEMS_PER_LINE:
-                        el = ELEMS_PER_LINE
-                else:
-                        el = block_size
-                fmt = "%3d," * el + "\n"
-                if block_size > ELEMS_PER_LINE:
-                        fmt = fmt * int(block_size / ELEMS_PER_LINE)
-                for i in range(0, len(table), block_size):
-                        print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
-        print("};\n")
-
-# Extract the unique combinations of properties into records
-def combine_tables(*tables):
-        records = {}
-        index = []
-        for t in zip(*tables):
-                i = records.get(t)
-                if i is None:
-                        i = records[t] = len(records)
-                index.append(i)
-        return index, records
-
-def get_record_size_struct(records):
-        size = 0
-        structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
-        'types in this structure definition from pcre2_internal.h (the actual\n' + \
-        'field names will be different):\n\ntypedef struct {\n'
-        for i in range(len(records[0])):
-                record_slice = [record[i] for record in records]
-                slice_type, slice_size = get_type_size(record_slice)
-                # add padding: round up to the nearest power of slice_size
-                size = (size + slice_size - 1) & -slice_size
-                size += slice_size
-                structure += '%s property_%d;\n' % (slice_type, i)
-
-        # round up to the first item of the next structure in array
-        record_slice = [record[0] for record in records]
-        slice_type, slice_size = get_type_size(record_slice)
-        size = (size + slice_size - 1) & -slice_size
-
-        structure += '} ucd_record;\n*/\n'
-        return size, structure
-
-def test_record_size():
-        tests = [ \
-          ( [(3,), (6,), (6,), (1,)], 1 ), \
-          ( [(300,), (600,), (600,), (100,)], 2 ), \
-          ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
-          ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
-          ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
-          ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
-          ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
-          ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
-        ]
-        for test in tests:
-            size, struct = get_record_size_struct(test[0])
-            assert(size == test[1])
-            #print struct
-
-def print_records(records, record_size):
-        print('const ucd_record PRIV(ucd_records)[] = { ' + \
-              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
-
-        records = list(zip(list(records.keys()), list(records.values())))
-        records.sort(key = lambda x: x[1])
-        for i, record in enumerate(records):
-                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
-        print('};\n')
-
-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
-# New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
-# New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
-# New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
-# New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic',
-# New for Unicode 6.1.0
- 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
-# New for Unicode 7.0.0
- 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
- 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
- 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
- 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
-# New for Unicode 8.0.0
- 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
- 'SignWriting',
-# New for Unicode 10.0.0
- 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
- 'Nushu', 'Soyombo', 'Zanabazar_Square',
-# New for Unicode 11.0.0
-  'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
-  'Old_Sogdian', 'Sogdian',
-# New for Unicode 12.0.0
-  'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
-# New for Unicode 13.0.0
-  'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
-# New for Unicode 14.0.0
-  'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
- ]
-
-script_abbrevs = [
-  'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
-  'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
-  'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
-  'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
-  'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
-  'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
-  'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
-#New for Unicode 5.0
-  'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
-#New for Unicode 5.1
-  'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
-  'Sund', 'Vaii',
-#New for Unicode 5.2
-  'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
-  'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
-#New for Unicode 6.0.0
-  'Batk', 'Brah', 'Mand',
-#New for Unicode 6.1.0
-  'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
-#New for Unicode 7.0.0
-  'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
-  'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
-  'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
-#New for Unicode 8.0.0
-  'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
-#New for Unicode 10.0.0
-  'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
-  'Zanb',
-#New for Unicode 11.0.0
-  'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
-#New for Unicode 12.0.0
-  'Elym', 'Nand', 'Hmnp', 'Wcho',
-#New for Unicode 13.0.0
-  'Chrs', 'Diak', 'Kits', 'Yezi',
-#New for Unicode 14.0.0
-  'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith'
- ]
-
-category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
-  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
-  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
-
-# The Extended_Pictographic property is not found in the file where all the
-# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
-# file, but we list it here so that the name has the correct index value.
-
-break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
-  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
-  'ZWJ', 'Extended_Pictographic' ]
-
-test_record_size()
-unicode_version = ""
-
-script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
-category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
-break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
-other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
-
-# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
-# we need to find the Extended_Pictographic property for emoji characters. This
-# can be set as an additional grapheme break property, because the default for
-# all the emojis is "other". We scan the emoji-data.txt file and modify the
-# break-props table.
-
-file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
-for line in file:
-        line = re.sub(r'#.*', '', line)
-        chardata = list(map(str.strip, line.split(';')))
-        if len(chardata) <= 1:
-                continue
-
-        if chardata[1] != "Extended_Pictographic":
-                continue
-
-        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-        char = int(m.group(1), 16)
-        if m.group(3) is None:
-                last = char
-        else:
-                last = int(m.group(3), 16)
-        for i in range(char, last + 1):
-                if break_props[i] != break_property_names.index('Other'):
-                   print("WARNING: Emoji 0x%x has break property %s, not 'Other'",
-                     i, break_property_names[break_props[i]], file=sys.stderr)
-                break_props[i] = break_property_names.index('Extended_Pictographic')
-file.close()
-
-# The Script Extensions property default value is the Script value. Parse the
-# file, setting 'Unknown' as the default (this will never be a Script Extension
-# value), then scan it and fill in the default from Scripts. Code added by PH
-# in October 2018. Positive values are used for just a single script for a
-# code point. Negative values are negated offsets in a list of lists of
-# multiple scripts. Initialize this list with a single entry, as the zeroth
-# element is never used.
-
-script_lists = [0]
-script_abbrevs_default = script_abbrevs.index('Zzzz')
-scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
-
-for i in range(0, MAX_UNICODE):
-  if scriptx[i] == script_abbrevs_default:
-    scriptx[i] = script[i]
-
-# With the addition of the new Script Extensions field, we need some padding
-# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
-# greater than 255 to make the field 16 bits.
-
-padding_dummy = [0] * MAX_UNICODE
-padding_dummy[0] = 256
-
-# This block of code was added by PH in September 2012. I am not a Python
-# programmer, so the style is probably dreadful, but it does the job. It scans
-# the other_case table to find sets of more than two characters that must all
-# match each other caselessly. Later in this script a table of these sets is
-# written out. However, we have to do this work here in order to compute the
-# offsets in the table that are inserted into the main table.
-
-# The CaseFolding.txt file lists pairs, but the common logic for reading data
-# sets only one value, so first we go through the table and set "return"
-# offsets for those that are not already set.
-
-for c in range(MAX_UNICODE):
-  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
-    other_case[c + other_case[c]] = -other_case[c]
-
-# Now scan again and create equivalence sets.
-
-sets = []
-
-for c in range(MAX_UNICODE):
-  o = c + other_case[c]
-
-  # Trigger when this character's other case does not point back here. We
-  # now have three characters that are case-equivalent.
-
-  if other_case[o] != -other_case[c]:
-    t = o + other_case[o]
-
-    # Scan the existing sets to see if any of the three characters are already
-    # part of a set. If so, unite the existing set with the new set.
-
-    appended = 0
-    for s in sets:
-      found = 0
-      for x in s:
-        if x == c or x == o or x == t:
-          found = 1
-
-      # Add new characters to an existing set
-
-      if found:
-        found = 0
-        for y in [c, o, t]:
-          for x in s:
-            if x == y:
-              found = 1
-          if not found:
-            s.append(y)
-        appended = 1
-
-    # If we have not added to an existing set, create a new one.
-
-    if not appended:
-      sets.append([c, o, t])
-
-# End of loop looking for caseless sets.
-
-# Now scan the sets and set appropriate offsets for the characters.
-
-caseless_offsets = [0] * MAX_UNICODE
-
-offset = 1;
-for s in sets:
-  for x in s:
-    caseless_offsets[x] = offset
-  offset += len(s) + 1
-
-# End of block of code for creating offsets for caseless matching sets.
-
-
-# Combine the tables
-
-table, records = combine_tables(script, category, break_props,
-  caseless_offsets, other_case, scriptx, padding_dummy)
-
-record_size, record_struct = get_record_size_struct(list(records.keys()))
-
-# Find the optimum block size for the two-stage table
-min_size = sys.maxsize
-for block_size in [2 ** i for i in range(5,10)]:
-        size = len(records) * record_size
-        stage1, stage2 = compress_table(table, block_size)
-        size += get_tables_size(stage1, stage2)
-        #print "/* block size %5d  => %5d bytes */" % (block_size, size)
-        if size < min_size:
-                min_size = size
-                min_stage1, min_stage2 = stage1, stage2
-                min_block_size = block_size
-
-print("/* This module is generated by the maint/MultiStage2.py script.")
-print("Do not modify it by hand. Instead modify the script and run it")
-print("to regenerate this code.")
-print()
-print("As well as being part of the PCRE2 library, this module is #included")
-print("by the pcre2test program, which redefines the PRIV macro to change")
-print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
-print("with the library. At present, just one of these tables is actually")
-print("needed. */")
-print()
-print("#ifndef PCRE2_PCRE2TEST")
-print()
-print("#ifdef HAVE_CONFIG_H")
-print("#include \"config.h\"")
-print("#endif")
-print()
-print("#include \"pcre2_internal.h\"")
-print()
-print("#endif /* PCRE2_PCRE2TEST */")
-print()
-print("/* Unicode character database. */")
-print("/* This file was autogenerated by the MultiStage2.py script. */")
-print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
-print()
-print("/* The tables herein are needed only when UCP support is built,")
-print("and in PCRE2 that happens automatically with UTF support.")
-print("This module should not be referenced otherwise, so")
-print("it should not matter whether it is compiled or not. However")
-print("a comment was received about space saving - maybe the guy linked")
-print("all the modules rather than using a library - so we include a")
-print("condition to cut out the tables when not needed. But don't leave")
-print("a totally empty module because some compilers barf at that.")
-print("Instead, just supply some small dummy tables. */")
-print()
-print("#ifndef SUPPORT_UNICODE")
-print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
-print("const uint16_t PRIV(ucd_stage1)[] = {0};")
-print("const uint16_t PRIV(ucd_stage2)[] = {0};")
-print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
-print("#else")
-print()
-print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
-print()
-print("/* If the 32-bit library is run in non-32-bit mode, character values")
-print("greater than 0x10ffff may be encountered. For these we set up a")
-print("special record. */")
-print()
-print("#if PCRE2_CODE_UNIT_WIDTH == 32")
-print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
-print("  ucp_Unknown,    /* script */")
-print("  ucp_Cn,         /* type unassigned */")
-print("  ucp_gbOther,    /* grapheme break property */")
-print("  0,              /* case set */")
-print("  0,              /* other case */")
-print("  ucp_Unknown,    /* script extension */")
-print("  0,              /* dummy filler */")
-print("  }};")
-print("#endif")
-print()
-print(record_struct)
-
-# --- Added by PH: output the table of caseless character sets ---
-
-print("/* This table contains lists of characters that are caseless sets of")
-print("more than one character. Each list is terminated by NOTACHAR. */\n")
-
-print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
-print("  NOTACHAR,")
-for s in sets:
-  s = sorted(s)
-  for x in s:
-    print('  0x%04x,' % x, end=' ')
-  print('  NOTACHAR,')
-print('};')
-print()
-
-# ------
-
-print("/* When #included in pcre2test, we don't need the table of digit")
-print("sets, nor the the large main UCD tables. */")
-print()
-print("#ifndef PCRE2_PCRE2TEST")
-print()
-
-# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
-
-digitsets = []
-file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
-
-for line in file:
-  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
-  if m is None:
-    continue
-  first = int(m.group(1),16)
-  last  = int(m.group(2),16)
-  if ((last - first + 1) % 10) != 0:
-    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
-      file=sys.stderr)
-  while first < last:
-    digitsets.append(first + 9)
-    first += 10
-file.close()
-digitsets.sort()
-
-print("/* This table lists the code points for the '9' characters in each")
-print("set of decimal digits. It is used to ensure that all the digits in")
-print("a script run come from the same set. */\n")
-print("const uint32_t PRIV(ucd_digit_sets)[] = {")
-
-print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
-count = 8
-for d in digitsets:
-  if count == 8:
-    print("\n ", end='')
-    count = 0
-  print(" 0x%05x," % d, end='')
-  count += 1
-print("\n};\n")
-
-print("/* This vector is a list of lists of scripts for the Script Extension")
-print("property. Each sublist is zero-terminated. */\n")
-print("const uint8_t PRIV(ucd_script_sets)[] = {")
-
-count = 0
-print("  /*   0 */", end='')
-for d in script_lists:
-  print(" %3d," % d, end='')
-  count += 1
-  if d == 0:
-    print("\n  /* %3d */" % count, end='')
-print("\n};\n")
-
-# Output the main UCD tables.
-
-print("/* These are the main two-stage UCD tables. The fields in each record are:")
-print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
-print("offset to multichar other cases or zero (8 bits), offset to other case")
-print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
-print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
-
-print_records(records, record_size)
-print_table(min_stage1, 'PRIV(ucd_stage1)')
-print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
-print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
-print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
-print("#endif")
-print("#endif  /* SUPPORT_UNICODE */")
-print()
-print("#endif  /* PCRE2_PCRE2TEST */")
-
-
-# This code was part of the original contribution, but is commented out as it
-# was never used. A two-stage table has sufficed.
-
-"""
-
-# Three-stage tables:
-
-# Find the optimum block size for 3-stage table
-min_size = sys.maxint
-for stage3_block in [2 ** i for i in range(2,6)]:
-        stage_i, stage3 = compress_table(table, stage3_block)
-        for stage2_block in [2 ** i for i in range(5,10)]:
-                size = len(records) * 4
-                stage1, stage2 = compress_table(stage_i, stage2_block)
-                size += get_tables_size(stage1, stage2, stage3)
-                # print "/* %5d / %3d  => %5d bytes */" % (stage2_block, stage3_block, size)
-                if size < min_size:
-                        min_size = size
-                        min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
-                        min_stage2_block, min_stage3_block = stage2_block, stage3_block
-
-print "/* Total size: %d bytes" % min_size */
-print_records(records)
-print_table(min_stage1, 'ucd_stage1')
-print_table(min_stage2, 'ucd_stage2', min_stage2_block)
-print_table(min_stage3, 'ucd_stage3', min_stage3_block)
-
-"""
--- a/maint/README
+++ b/maint/README
@ -16,99 +16,122 @@ and also contains some notes for maintainers. Its contents are:
 Files in the maint directory
 ============================

-GenerateUtt.py   A Python script to generate part of the pcre2_tables.c file
-                 that contains Unicode script names in a long string with
-                 offsets, which is tedious to maintain by hand.
+GenerateCommon.py
+  A Python module containing data and functions that are used by the other
+  Generate scripts.
+  
+GenerateTest26.py
+  A Python script that generates input and expected output test data for test
+  26, which tests certain aspects of Unicode property support.  

-ManyConfigTests  A shell script that runs "configure, make, test" a number of
-                 times with different configuration settings.
+GenerateUcd.py
+  A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
+  and Unicode data files, which are themselves downloaded from the Unicode web
+  site. The generated file contains the tables for a 2-stage lookup of Unicode
+  properties, along with some auxiliary tables. The script starts with a long
+  comment that gives details of the tables it constructs. 

-MultiStage2.py   A Python script that generates the file pcre2_ucd.c from six
-                 Unicode data files, which are themselves downloaded from the
-                 Unicode web site. Run this script in the "maint" directory.
-                 The generated file is written to stdout. It contains the
-                 tables for a 2-stage lookup of Unicode properties, along with
-                 some auxiliary tables.
+GenerateUcpHeader.py
+  A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
+  and Unicode data files. The generated file defines constants for various
+  Unicode property values.
+
+GenerateUcpTables.py
+  A Python script that generates the file pcre2_ucptables.c from
+  GenerateCommon.py and Unicode data files. The generated file contains tables
+  for looking up Unicode property names.
+
+ManyConfigTests
+  A shell script that runs "configure, make, test" a number of times with
+  different configuration settings.

 pcre2_chartables.c.non-standard
-                 This is a set of character tables that came from a Windows
-                 system. It has characters greater than 128 that are set as
-                 spaces, amongst other things. I kept it so that it can be
-                 used for testing from time to time.
+  This is a set of character tables that came from a Windows system. It has
+  characters greater than 128 that are set as spaces, amongst other things. I
+  kept it so that it can be used for testing from time to time.

-README           This file.
+README
+  This file.

-Unicode.tables   The files in this directory were downloaded from the Unicode
-                 web site. They contain information about Unicode characters
-                 and scripts. The ones used by the MultiStage2.py script are
-                 CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt,
-                 ScriptExtensions.txt, GraphemeBreakProperty.txt, and
-                 emoji-data.txt. I've kept UnicodeData.txt (which is no longer
-                 used by the script) because it is useful occasionally for
-                 manually looking up the details of certain characters.
-                 However, note that character names in this file such as
-                 "Arabic sign sanah" do NOT mean that the character is in a
-                 particular script (in this case, Arabic). Scripts.txt and
-                 ScriptExtensions.txt are where to look for script information.
+Unicode.tables
+  The files in this directory were downloaded from the Unicode web site. They
+  contain information about Unicode characters and scripts, and are used by the
+  Generate scripts. UnicodeData.txt, which is no longer used by any script, is
+  retained because it is useful occasionally for manually looking up the
+  details of certain characters. However, note that character names in this
+  file such as "Arabic sign sanah" do NOT mean that the character is in a
+  particular script (in this case, Arabic). Scripts.txt and
+  ScriptExtensions.txt are where to look for script information.

-ucptest.c        A short C program for testing the Unicode property macros
-                 that do lookups in the pcre2_ucd.c data, mainly useful after
-                 rebuilding the Unicode property table. Compile and run this in
-                 the "maint" directory (see comments at its head). This program
-                 can also be used to find characters with specific properties.
+ucptest.c
+  A program for testing the Unicode property macros that do lookups in the
+  pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
+  Compile and run this in the "maint" directory (see comments at its head).
+  This program can also be used to find characters with specific properties and 
+  to list which properties are supported. 

-ucptestdata      A directory containing four files, testinput{1,2} and
-                 testoutput{1,2}, for use in conjunction with the ucptest
-                 program.
+ucptestdata
+  A directory containing four files, testinput{1,2} and testoutput{1,2}, for
+  use in conjunction with the ucptest program.

-utf8.c           A short, freestanding C program for converting a Unicode code
-                 point into a sequence of bytes in the UTF-8 encoding, and vice
-                 versa. If its argument is a hex number such as 0x1234, it
-                 outputs a list of the equivalent UTF-8 bytes. If its argument
-                 is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
-                 treats them as a UTF-8 character and outputs the equivalent
-                 code point in hex. See comments at its head for details.
+utf8.c
+  A short, freestanding C program for converting a Unicode code point into a
+  sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
+  hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
+  If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
+  treats them as a UTF-8 string and outputs the equivalent code points in hex.
+  See comments at its head for details.


 Updating to a new Unicode release
 =================================

 When there is a new release of Unicode, the files in Unicode.tables must be
-refreshed from the web site. If the new version of Unicode adds new character
-scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
-GenerateUtt.py scripts must be edited to add the new names. I have been adding
-each new group at the end of the relevant list, with a comment. Note also that
-both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode
-script names.
+refreshed from the web site. Once that is done, the four Python scripts that 
+generate files from the Unicode data can be run from within the "maint" 
+directory.

-MultiStage2.py has two lists: the full names and the abbreviations that are
-found in the ScriptExtensions.txt file. A list of script names and their
-abbreviations can be found in the PropertyValueAliases.txt file on the
-Unicode web site. There is also a Wikipedia page that lists them, and notes the
-Unicode version in which they were introduced:
+Note: Previously, it was necessary to update lists of scripts and their 
+abbreviations by hand before running the Python scripts. This is no longer
+necessary because the scripts have been upgraded to extract this information
+themselves. Also, there used to be explicit lists of scripts in two of the man
+pages. This is no longer the case; the pcre2test program can now output a list 
+of supported scripts.

-https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
+You can give an output file name as an argument to the following scripts, but
+by default:

-Once the script name lists have been updated, MultiStage2.py can be run to
-generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to
-generate the tricky tables for inclusion in pcre2_tables.c (which must be
-hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x
-not in list", the cause is usually a missing (or misspelt) name in one of the
-lists of scripts.
+GenerateUcd.py        creates pcre2_ucd.c        )
+GenerateUcpHeader.py  creates pcre2_ucp.h        ) in the current directory
+GenerateUcpTables.py  creates pcre2_ucptables.c  )

-The ucptest program can be compiled and used to check that the new tables in
-pcre2_ucd.c work properly, using the data files in ucptestdata to check a
-number of test characters. It used to be necessary to update the source
-ucptest.c whenever new Unicode scripts were added, but this is no longer
-required because that program now uses the lists in the PCRE2 source. However,
-adding a few tests for new scripts to the files in ucptestdata is a good idea.
+These files can be compared against the existing versions in the src directory
+to check on any changes before replacing the old files, but you can also
+generate directly into the final location by running:
+
+./GenerateUcd.py       ../src/pcre2_ucd.c
+./GenerateUcpHeader.py ../src/pcre2_ucp.h
+./GenerateUcpTables.py ../src/pcre2_ucptables.c
+
+Once the .c and .h files are in the ../src directory, the ucptest program can
+be compiled and used to check that the new tables work properly. The data files
+in ucptestdata are set up to check a number of test characters. See the
+comments at the start of ucptest.c. If there are new scripts, adding a few
+tests to the files in ucptestdata is a good idea.
+
+Finally, you should run the GenerateTest26.py script to generate new versions
+of the input and expected output for a series of Unicode property tests that
+are automatically generated from the Unicode data files. By default, the files
+are written to testinput26 and testoutput26 in the current directory, but you
+can give an alternative directory name as an argument to the script. These
+files should eventually be installed in the main testdata directory.


 Preparing for a PCRE2 release
 =============================

-This section contains a checklist of things that I consult before building a
-distribution for a new release.
+This section contains a checklist of things that I do before building a new
+release.

 . Ensure that the version number and version date are correct in configure.ac.

@ -117,17 +140,16 @@ distribution for a new release.

 . If new build options or new source files have been added, ensure that they
  are added to the CMake files as well as to the autoconf files. The relevant
-  files are CMakeLists.txt and config-cmake.h.in. After making a release
-  tarball, test it out with CMake if there have been changes here.
+  files are CMakeLists.txt and config-cmake.h.in. After making a release, test
+  it out with CMake if there have been changes here.

 . Run ./autogen.sh to ensure everything is up-to-date.

 . Compile and test with many different config options, and combinations of
  options. Also, test with valgrind by running "RunTest valgrind" and
-  "RunGrepTest valgrind" (which takes quite a long time). The script
-  maint/ManyConfigTests now encapsulates this testing. It runs tests with
-  different configurations, and it also runs some of them with valgrind, all of
-  which can take quite some time.
+  "RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
+  this testing. It runs tests with different configurations, and it also runs
+  some of them with valgrind, all of which can take quite some time.

 . Run tests in both 32-bit and 64-bit environments if possible. I can no longer
  run 32-bit tests.
@ -142,7 +164,8 @@ distribution for a new release.
  -fsanitize=signed-integer-overflow

 . Do a test build using CMake. Remove src/config.h first, lest it override the
-  version that CMake creates. Do NOT use parallel make.
+  version that CMake creates. Also do a CMake unity build to check that it 
+  still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.

 . Run perltest.sh on the test data for tests 1 and 4. The output should match
  the PCRE2 test output, apart from the version identification at the start of
@ -161,11 +184,12 @@ distribution for a new release.
  systems. For example, on Solaris it is helpful to test using Sun's cc
  compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
  64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
-  for test 2. Since I retired I can no longer do much of this, but instead I
-  rely on putting out release candidates for testing by the community.
+  for test 2. Since I retired I can no longer do much of this. There are 
+  automated tests under Ubuntu, Alpine, and Windows that are now set up as 
+  GitHub actions. Check that they are running clean.

 . The buildbots at http://buildfarm.opencsw.org/ do some automated testing
-  of PCRE2 and should be checked before putting out a release.
+  of PCRE2 and should also be checked before putting out a release.


 Updating version info for libtool
@ -221,10 +245,11 @@ it reports them and then aborts. Otherwise it removes trailing spaces from
 sources and refreshes the HTML documentation. Update the GitHub repository with
 "git push".

-Once PrepareRelease has run clean, run "make distcheck" to create the tarball
+Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
 and the zipball. I then sign these files. Double-check with "git status" that
-the repository is fully up-to-date, then create a new tag on GitHub. Upload the
-tarball, zipball, and the signatures as "assets" of the GitHub release.
+the repository is fully up-to-date, then create a new tag and a release on
+GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
+GitHub release.

 When the new release is out, don't forget to tell webmaster@pcre.org and the
 mailing list.
@ -343,8 +368,6 @@ years.

  See Unicode TR 29. The last two are very much aimed at natural language.

-. (?[...]) extended classes: big project.
-
 . Allow a callout to specify a number of characters to skip. This can be done
  compatibly via an extra callout field.

@ -414,13 +437,8 @@ years.
  with lookarounds for \b and \B. Ideally the setting should last till the end
  of the group, which means remembering all previous settings; maybe a fixed
  amount of stack would do - how deep would anyone want to nest these things?
-  See GitHub issue #13 for a compendium of character class issues.
-
-. Recognize the short script names. They are already listed in maint/
-  Multistage2.py because they are needed for scanning the script extensions
-  file.
-
-. Use script extensions for \p?
+  See GitHub issue #13 for a compendium of character class issues, including
+  (?[...]) extended classes.

 . A user suggested something like --with-build-info to set a build information
  string that could be retrieved by pcre2_config(). However, there's no
@ -439,4 +457,4 @@ years.
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 26 August 2021
+Last updated: 25 April 2022
--- a/maint/Unicode.tables/BidiMirroring.txt
+++ b/maint/Unicode.tables/BidiMirroring.txt
@ -0,0 +1,633 @@
+# BidiMirroring-14.0.0.txt
+# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
+# © 2021 Unicode®, Inc.
+# For terms of use, see https://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see https://www.unicode.org/reports/tr44/
+#
+# Bidi_Mirroring_Glyph Property
+#
+# This file is an informative contributory data file in the
+# Unicode Character Database.
+#
+# This data file lists characters that have the Bidi_Mirrored=Yes property
+# value, for which there is another Unicode character that typically has a glyph
+# that is the mirror image of the original character's glyph.
+#
+# The repertoire covered by the file is Unicode 14.0.0.
+#
+# The file contains a list of lines with mappings from one code point
+# to another one for character-based mirroring.
+# Note that for "real" mirroring, a rendering engine needs to select
+# appropriate alternative glyphs, and that many Unicode characters do not
+# have a mirror-image Unicode character.
+#
+# Each mapping line contains two fields, separated by a semicolon (';').
+# Each of the two fields contains a code point represented as a
+# variable-length hexadecimal value with 4 to 6 digits.
+# A comment indicates where the characters are "BEST FIT" mirroring.
+#
+# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
+# characters exist with mirrored glyphs, are
+# listed as comments at the end of the file.
+#
+# Formally, the default value of the Bidi_Mirroring_Glyph property
+# for each code point is <none>, unless a mapping to
+# some other character is specified in this data file. When a code
+# point has the default value for the Bidi_Mirroring_Glyph property,
+# that means that no other character exists whose glyph is suitable
+# for character-based mirroring.
+#
+# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
+# at https://www.unicode.org/reports/tr9/
+#
+# This file was originally created by Markus Scherer.
+# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
+# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
+#
+# Historical and Compatibility Information:
+#
+# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
+# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
+# See https://www.microsoft.com/typography/otspec/ompl.txt
+#
+# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
+# added one mirroring pair: 27CB <--> 27CD.
+#
+# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
+# underwent a substantial revision, to formally recognize all of the
+# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
+# added after the freezing of the OMPL list. As a result, starting
+# with Unicode 11.0, the bmg mapping values more accurately reflect
+# the current status of glyphs for Bidi_Mirrored characters in
+# the Unicode Standard, but this listing now extends significantly
+# beyond the frozen OMPL list. Implementers should be aware of this
+# intentional distinction.
+#
+# ############################################################
+#
+# Property:	Bidi_Mirroring_Glyph
+#
+# @missing: 0000..10FFFF; <none>
+
+0028; 0029 # LEFT PARENTHESIS
+0029; 0028 # RIGHT PARENTHESIS
+003C; 003E # LESS-THAN SIGN
+003E; 003C # GREATER-THAN SIGN
+005B; 005D # LEFT SQUARE BRACKET
+005D; 005B # RIGHT SQUARE BRACKET
+007B; 007D # LEFT CURLY BRACKET
+007D; 007B # RIGHT CURLY BRACKET
+00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
+0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
+0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
+0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
+169B; 169C # OGHAM FEATHER MARK
+169C; 169B # OGHAM REVERSED FEATHER MARK
+2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
+2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
+207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
+207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
+208D; 208E # SUBSCRIPT LEFT PARENTHESIS
+208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
+2208; 220B # ELEMENT OF
+2209; 220C # [BEST FIT] NOT AN ELEMENT OF
+220A; 220D # SMALL ELEMENT OF
+220B; 2208 # CONTAINS AS MEMBER
+220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
+220D; 220A # SMALL CONTAINS AS MEMBER
+2215; 29F5 # DIVISION SLASH
+221F; 2BFE # RIGHT ANGLE
+2220; 29A3 # ANGLE
+2221; 299B # MEASURED ANGLE
+2222; 29A0 # SPHERICAL ANGLE
+2224; 2AEE # DOES NOT DIVIDE
+223C; 223D # TILDE OPERATOR
+223D; 223C # REVERSED TILDE
+2243; 22CD # ASYMPTOTICALLY EQUAL TO
+2245; 224C # APPROXIMATELY EQUAL TO
+224C; 2245 # ALL EQUAL TO
+2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
+2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
+2254; 2255 # COLON EQUALS
+2255; 2254 # EQUALS COLON
+2264; 2265 # LESS-THAN OR EQUAL TO
+2265; 2264 # GREATER-THAN OR EQUAL TO
+2266; 2267 # LESS-THAN OVER EQUAL TO
+2267; 2266 # GREATER-THAN OVER EQUAL TO
+2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
+2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
+226A; 226B # MUCH LESS-THAN
+226B; 226A # MUCH GREATER-THAN
+226E; 226F # [BEST FIT] NOT LESS-THAN
+226F; 226E # [BEST FIT] NOT GREATER-THAN
+2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
+2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
+2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
+2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
+2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
+2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
+2276; 2277 # LESS-THAN OR GREATER-THAN
+2277; 2276 # GREATER-THAN OR LESS-THAN
+2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
+2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
+227A; 227B # PRECEDES
+227B; 227A # SUCCEEDS
+227C; 227D # PRECEDES OR EQUAL TO
+227D; 227C # SUCCEEDS OR EQUAL TO
+227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
+227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
+2280; 2281 # [BEST FIT] DOES NOT PRECEDE
+2281; 2280 # [BEST FIT] DOES NOT SUCCEED
+2282; 2283 # SUBSET OF
+2283; 2282 # SUPERSET OF
+2284; 2285 # [BEST FIT] NOT A SUBSET OF
+2285; 2284 # [BEST FIT] NOT A SUPERSET OF
+2286; 2287 # SUBSET OF OR EQUAL TO
+2287; 2286 # SUPERSET OF OR EQUAL TO
+2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
+2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
+228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
+228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
+228F; 2290 # SQUARE IMAGE OF
+2290; 228F # SQUARE ORIGINAL OF
+2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
+2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
+2298; 29B8 # CIRCLED DIVISION SLASH
+22A2; 22A3 # RIGHT TACK
+22A3; 22A2 # LEFT TACK
+22A6; 2ADE # ASSERTION
+22A8; 2AE4 # TRUE
+22A9; 2AE3 # FORCES
+22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
+22B0; 22B1 # PRECEDES UNDER RELATION
+22B1; 22B0 # SUCCEEDS UNDER RELATION
+22B2; 22B3 # NORMAL SUBGROUP OF
+22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
+22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
+22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
+22B6; 22B7 # ORIGINAL OF
+22B7; 22B6 # IMAGE OF
+22B8; 27DC # MULTIMAP
+22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
+22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
+22CB; 22CC # LEFT SEMIDIRECT PRODUCT
+22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
+22CD; 2243 # REVERSED TILDE EQUALS
+22D0; 22D1 # DOUBLE SUBSET
+22D1; 22D0 # DOUBLE SUPERSET
+22D6; 22D7 # LESS-THAN WITH DOT
+22D7; 22D6 # GREATER-THAN WITH DOT
+22D8; 22D9 # VERY MUCH LESS-THAN
+22D9; 22D8 # VERY MUCH GREATER-THAN
+22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
+22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
+22DC; 22DD # EQUAL TO OR LESS-THAN
+22DD; 22DC # EQUAL TO OR GREATER-THAN
+22DE; 22DF # EQUAL TO OR PRECEDES
+22DF; 22DE # EQUAL TO OR SUCCEEDS
+22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
+22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
+22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
+22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
+22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
+22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
+22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
+22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
+22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
+22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
+22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
+22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
+22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
+22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
+22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
+22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
+22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
+22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22F6; 22FD # ELEMENT OF WITH OVERBAR
+22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
+22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
+22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22FD; 22F6 # CONTAINS WITH OVERBAR
+22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
+2308; 2309 # LEFT CEILING
+2309; 2308 # RIGHT CEILING
+230A; 230B # LEFT FLOOR
+230B; 230A # RIGHT FLOOR
+2329; 232A # LEFT-POINTING ANGLE BRACKET
+232A; 2329 # RIGHT-POINTING ANGLE BRACKET
+2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
+2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
+276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
+276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
+276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
+276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
+276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
+276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
+2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
+2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
+2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
+2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
+2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
+2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
+27C3; 27C4 # OPEN SUBSET
+27C4; 27C3 # OPEN SUPERSET
+27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
+27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
+27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
+27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
+27CB; 27CD # MATHEMATICAL RISING DIAGONAL
+27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
+27D5; 27D6 # LEFT OUTER JOIN
+27D6; 27D5 # RIGHT OUTER JOIN
+27DC; 22B8 # LEFT MULTIMAP
+27DD; 27DE # LONG RIGHT TACK
+27DE; 27DD # LONG LEFT TACK
+27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
+27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
+27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
+27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
+27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
+27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
+27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
+27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
+27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
+27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
+27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
+27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
+27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
+27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
+2983; 2984 # LEFT WHITE CURLY BRACKET
+2984; 2983 # RIGHT WHITE CURLY BRACKET
+2985; 2986 # LEFT WHITE PARENTHESIS
+2986; 2985 # RIGHT WHITE PARENTHESIS
+2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
+2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
+2989; 298A # Z NOTATION LEFT BINDING BRACKET
+298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
+298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
+298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
+298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
+298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
+298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
+2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
+2991; 2992 # LEFT ANGLE BRACKET WITH DOT
+2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
+2993; 2994 # LEFT ARC LESS-THAN BRACKET
+2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
+2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
+2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
+2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
+2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
+299B; 2221 # MEASURED ANGLE OPENING LEFT
+29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
+29A3; 2220 # REVERSED ANGLE
+29A4; 29A5 # ANGLE WITH UNDERBAR
+29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
+29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
+29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
+29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
+29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
+29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
+29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
+29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
+29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
+29B8; 2298 # CIRCLED REVERSE SOLIDUS
+29C0; 29C1 # CIRCLED LESS-THAN
+29C1; 29C0 # CIRCLED GREATER-THAN
+29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
+29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
+29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
+29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
+29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
+29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
+29D4; 29D5 # TIMES WITH LEFT HALF BLACK
+29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
+29D8; 29D9 # LEFT WIGGLY FENCE
+29D9; 29D8 # RIGHT WIGGLY FENCE
+29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
+29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
+29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
+29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
+29F5; 2215 # REVERSE SOLIDUS OPERATOR
+29F8; 29F9 # BIG SOLIDUS
+29F9; 29F8 # BIG REVERSE SOLIDUS
+29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
+29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
+2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
+2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
+2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
+2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
+2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
+2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
+2A3C; 2A3D # INTERIOR PRODUCT
+2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
+2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
+2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
+2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
+2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
+2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
+2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
+2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
+2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
+2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
+2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
+2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
+2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
+2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
+2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
+2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
+2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
+2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
+2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
+2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
+2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
+2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
+2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
+2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
+2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
+2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
+2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
+2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
+2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
+2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
+2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
+2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
+2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
+2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
+2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
+2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
+2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
+2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
+2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
+2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
+2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
+2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
+2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
+2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
+2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
+2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
+2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
+2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
+2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
+2AAA; 2AAB # SMALLER THAN
+2AAB; 2AAA # LARGER THAN
+2AAC; 2AAD # SMALLER THAN OR EQUAL TO
+2AAD; 2AAC # LARGER THAN OR EQUAL TO
+2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
+2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
+2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
+2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
+2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
+2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
+2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
+2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
+2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
+2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
+2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
+2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
+2ABB; 2ABC # DOUBLE PRECEDES
+2ABC; 2ABB # DOUBLE SUCCEEDS
+2ABD; 2ABE # SUBSET WITH DOT
+2ABE; 2ABD # SUPERSET WITH DOT
+2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
+2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
+2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
+2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
+2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
+2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
+2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
+2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
+2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
+2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
+2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
+2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
+2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
+2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
+2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
+2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
+2ACF; 2AD0 # CLOSED SUBSET
+2AD0; 2ACF # CLOSED SUPERSET
+2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
+2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
+2AD3; 2AD4 # SUBSET ABOVE SUPERSET
+2AD4; 2AD3 # SUPERSET ABOVE SUBSET
+2AD5; 2AD6 # SUBSET ABOVE SUBSET
+2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
+2ADE; 22A6 # SHORT LEFT TACK
+2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
+2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
+2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
+2AEC; 2AED # DOUBLE STROKE NOT SIGN
+2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
+2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
+2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
+2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
+2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
+2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
+2BFE; 221F # REVERSED RIGHT ANGLE
+2E02; 2E03 # LEFT SUBSTITUTION BRACKET
+2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
+2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
+2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
+2E09; 2E0A # LEFT TRANSPOSITION BRACKET
+2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
+2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
+2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
+2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
+2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
+2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
+2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
+2E22; 2E23 # TOP LEFT HALF BRACKET
+2E23; 2E22 # TOP RIGHT HALF BRACKET
+2E24; 2E25 # BOTTOM LEFT HALF BRACKET
+2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
+2E26; 2E27 # LEFT SIDEWAYS U BRACKET
+2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
+2E28; 2E29 # LEFT DOUBLE PARENTHESIS
+2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
+2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
+2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
+2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
+2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
+2E59; 2E5A # TOP HALF LEFT PARENTHESIS
+2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
+2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
+2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
+3008; 3009 # LEFT ANGLE BRACKET
+3009; 3008 # RIGHT ANGLE BRACKET
+300A; 300B # LEFT DOUBLE ANGLE BRACKET
+300B; 300A # RIGHT DOUBLE ANGLE BRACKET
+300C; 300D # [BEST FIT] LEFT CORNER BRACKET
+300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
+300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
+300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
+3010; 3011 # LEFT BLACK LENTICULAR BRACKET
+3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
+3014; 3015 # LEFT TORTOISE SHELL BRACKET
+3015; 3014 # RIGHT TORTOISE SHELL BRACKET
+3016; 3017 # LEFT WHITE LENTICULAR BRACKET
+3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
+3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
+3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
+301A; 301B # LEFT WHITE SQUARE BRACKET
+301B; 301A # RIGHT WHITE SQUARE BRACKET
+FE59; FE5A # SMALL LEFT PARENTHESIS
+FE5A; FE59 # SMALL RIGHT PARENTHESIS
+FE5B; FE5C # SMALL LEFT CURLY BRACKET
+FE5C; FE5B # SMALL RIGHT CURLY BRACKET
+FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
+FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
+FE64; FE65 # SMALL LESS-THAN SIGN
+FE65; FE64 # SMALL GREATER-THAN SIGN
+FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
+FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
+FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
+FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
+FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
+FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
+FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
+FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
+FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
+FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
+FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
+FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
+
+# The following characters have no appropriate mirroring character.
+# For these characters it is up to the rendering system
+#   to provide mirrored glyphs.
+
+# 2140; DOUBLE-STRUCK N-ARY SUMMATION
+# 2201; COMPLEMENT
+# 2202; PARTIAL DIFFERENTIAL
+# 2203; THERE EXISTS
+# 2204; THERE DOES NOT EXIST
+# 2211; N-ARY SUMMATION
+# 2216; SET MINUS
+# 221A; SQUARE ROOT
+# 221B; CUBE ROOT
+# 221C; FOURTH ROOT
+# 221D; PROPORTIONAL TO
+# 2226; NOT PARALLEL TO
+# 222B; INTEGRAL
+# 222C; DOUBLE INTEGRAL
+# 222D; TRIPLE INTEGRAL
+# 222E; CONTOUR INTEGRAL
+# 222F; SURFACE INTEGRAL
+# 2230; VOLUME INTEGRAL
+# 2231; CLOCKWISE INTEGRAL
+# 2232; CLOCKWISE CONTOUR INTEGRAL
+# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
+# 2239; EXCESS
+# 223B; HOMOTHETIC
+# 223E; INVERTED LAZY S
+# 223F; SINE WAVE
+# 2240; WREATH PRODUCT
+# 2241; NOT TILDE
+# 2242; MINUS TILDE
+# 2244; NOT ASYMPTOTICALLY EQUAL TO
+# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
+# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
+# 2248; ALMOST EQUAL TO
+# 2249; NOT ALMOST EQUAL TO
+# 224A; ALMOST EQUAL OR EQUAL TO
+# 224B; TRIPLE TILDE
+# 225F; QUESTIONED EQUAL TO
+# 2260; NOT EQUAL TO
+# 2262; NOT IDENTICAL TO
+# 228C; MULTISET
+# 22A7; MODELS
+# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
+# 22AC; DOES NOT PROVE
+# 22AD; NOT TRUE
+# 22AE; DOES NOT FORCE
+# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
+# 22BE; RIGHT ANGLE WITH ARC
+# 22BF; RIGHT TRIANGLE
+# 22F5; ELEMENT OF WITH DOT ABOVE
+# 22F8; ELEMENT OF WITH UNDERBAR
+# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
+# 22FF; Z NOTATION BAG MEMBERSHIP
+# 2320; TOP HALF INTEGRAL
+# 2321; BOTTOM HALF INTEGRAL
+# 27C0; THREE DIMENSIONAL ANGLE
+# 27CC; LONG DIVISION
+# 27D3; LOWER RIGHT CORNER WITH DOT
+# 27D4; UPPER LEFT CORNER WITH DOT
+# 299C; RIGHT ANGLE VARIANT WITH SQUARE
+# 299D; MEASURED RIGHT ANGLE WITH DOT
+# 299E; ANGLE WITH S INSIDE
+# 299F; ACUTE ANGLE
+# 29A2; TURNED ANGLE
+# 29A6; OBLIQUE ANGLE OPENING UP
+# 29A7; OBLIQUE ANGLE OPENING DOWN
+# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
+# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
+# 29C9; TWO JOINED SQUARES
+# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
+# 29DC; INCOMPLETE INFINITY
+# 29E1; INCREASES AS
+# 29E3; EQUALS SIGN AND SLANTED PARALLEL
+# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
+# 29E5; IDENTICAL TO AND SLANTED PARALLEL
+# 29F4; RULE-DELAYED
+# 29F6; SOLIDUS WITH OVERBAR
+# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
+# 2A0A; MODULO TWO SUM
+# 2A0B; SUMMATION WITH INTEGRAL
+# 2A0C; QUADRUPLE INTEGRAL OPERATOR
+# 2A0D; FINITE PART INTEGRAL
+# 2A0E; INTEGRAL WITH DOUBLE STROKE
+# 2A0F; INTEGRAL AVERAGE WITH SLASH
+# 2A10; CIRCULATION FUNCTION
+# 2A11; ANTICLOCKWISE INTEGRATION
+# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
+# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
+# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
+# 2A15; INTEGRAL AROUND A POINT OPERATOR
+# 2A16; QUATERNION INTEGRAL OPERATOR
+# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
+# 2A18; INTEGRAL WITH TIMES SIGN
+# 2A19; INTEGRAL WITH INTERSECTION
+# 2A1A; INTEGRAL WITH UNION
+# 2A1B; INTEGRAL WITH OVERBAR
+# 2A1C; INTEGRAL WITH UNDERBAR
+# 2A1E; LARGE LEFT TRIANGLE OPERATOR
+# 2A1F; Z NOTATION SCHEMA COMPOSITION
+# 2A20; Z NOTATION SCHEMA PIPING
+# 2A21; Z NOTATION SCHEMA PROJECTION
+# 2A24; PLUS SIGN WITH TILDE ABOVE
+# 2A26; PLUS SIGN WITH TILDE BELOW
+# 2A29; MINUS SIGN WITH COMMA ABOVE
+# 2A3E; Z NOTATION RELATIONAL COMPOSITION
+# 2A57; SLOPING LARGE OR
+# 2A58; SLOPING LARGE AND
+# 2A6A; TILDE OPERATOR WITH DOT ABOVE
+# 2A6B; TILDE OPERATOR WITH RISING DOTS
+# 2A6C; SIMILAR MINUS SIMILAR
+# 2A6D; CONGRUENT WITH DOT ABOVE
+# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
+# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
+# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
+# 2A74; DOUBLE COLON EQUAL
+# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
+# 2ADC; FORKING
+# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
+# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
+# 2AF3; PARALLEL WITH TILDE OPERATOR
+# 2AFB; TRIPLE SOLIDUS BINARY RELATION
+# 2AFD; DOUBLE SOLIDUS OPERATOR
+# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
+# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
+# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
+# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
+# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
+
+# EOF
--- a/maint/Unicode.tables/DerivedBidiClass.txt
+++ b/maint/Unicode.tables/DerivedBidiClass.txt
--- a/maint/Unicode.tables/DerivedCoreProperties.txt
+++ b/maint/Unicode.tables/DerivedCoreProperties.txt
--- a/maint/Unicode.tables/PropList.txt
+++ b/maint/Unicode.tables/PropList.txt
--- a/maint/Unicode.tables/PropertyAliases.txt
+++ b/maint/Unicode.tables/PropertyAliases.txt
@ -0,0 +1,212 @@
+# PropertyAliases-14.0.0.txt
+# Date: 2021-03-08, 19:35:48 GMT
+# © 2021 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+#   For documentation, see http://www.unicode.org/reports/tr44/
+#
+# This file contains aliases for properties used in the UCD.
+# These names can be used for XML formats of UCD data, for regular-expression
+# property tests, and other programmatic textual descriptions of Unicode data.
+#
+# The names may be translated in appropriate environments, and additional
+# aliases may be useful.
+#
+# FORMAT
+#
+# Each line has two or more fields, separated by semicolons.
+#
+# First Field: The first field is the short name for the property.
+# It is typically an abbreviation, but in a number of cases it is simply
+# a duplicate of the "long name" in the second field.
+# For Unihan database tags, the short name is actually a longer string than
+# the tag specified in the second field.
+#
+# Second Field: The second field is the long name for the property,
+# typically the formal name used in documentation about the property.
+#
+# The above are the preferred aliases. Other aliases may be listed in additional fields.
+#
+# Loose matching should be applied to all property names and property values, with
+# the exception of String Property values. With loose matching of property names and
+# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
+# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
+#
+# NOTE: Property value names are NOT unique across properties. For example:
+#
+#   AL means Arabic Letter for the Bidi_Class property, and
+#   AL means Above_Left for the Combining_Class property, and
+#   AL means Alphabetic for the Line_Break property.
+#
+# In addition, some property names may be the same as some property value names.
+# For example:
+#
+#   sc means the Script property, and
+#   Sc means the General_Category property value Currency_Symbol (Sc)
+#
+# The combination of property value and property name is, however, unique.
+#
+# For more information, see UAX #44, Unicode Character Database, and
+# UTS #18, Unicode Regular Expressions.
+# ================================================
+
+
+# ================================================
+# Numeric Properties
+# ================================================
+cjkAccountingNumeric     ; kAccountingNumeric
+cjkOtherNumeric          ; kOtherNumeric
+cjkPrimaryNumeric        ; kPrimaryNumeric
+nv                       ; Numeric_Value
+
+# ================================================
+# String Properties
+# ================================================
+cf                       ; Case_Folding
+cjkCompatibilityVariant  ; kCompatibilityVariant
+dm                       ; Decomposition_Mapping
+FC_NFKC                  ; FC_NFKC_Closure
+lc                       ; Lowercase_Mapping
+NFKC_CF                  ; NFKC_Casefold
+scf                      ; Simple_Case_Folding         ; sfc
+slc                      ; Simple_Lowercase_Mapping
+stc                      ; Simple_Titlecase_Mapping
+suc                      ; Simple_Uppercase_Mapping
+tc                       ; Titlecase_Mapping
+uc                       ; Uppercase_Mapping
+
+# ================================================
+# Miscellaneous Properties
+# ================================================
+bmg                      ; Bidi_Mirroring_Glyph
+bpb                      ; Bidi_Paired_Bracket
+cjkIICore                ; kIICore
+cjkIRG_GSource           ; kIRG_GSource
+cjkIRG_HSource           ; kIRG_HSource
+cjkIRG_JSource           ; kIRG_JSource
+cjkIRG_KPSource          ; kIRG_KPSource
+cjkIRG_KSource           ; kIRG_KSource
+cjkIRG_MSource           ; kIRG_MSource
+cjkIRG_SSource           ; kIRG_SSource
+cjkIRG_TSource           ; kIRG_TSource
+cjkIRG_UKSource          ; kIRG_UKSource
+cjkIRG_USource           ; kIRG_USource
+cjkIRG_VSource           ; kIRG_VSource
+cjkRSUnicode             ; kRSUnicode                  ; Unicode_Radical_Stroke; URS
+EqUIdeo                  ; Equivalent_Unified_Ideograph
+isc                      ; ISO_Comment
+JSN                      ; Jamo_Short_Name
+na                       ; Name
+na1                      ; Unicode_1_Name
+Name_Alias               ; Name_Alias
+scx                      ; Script_Extensions
+
+# ================================================
+# Catalog Properties
+# ================================================
+age                      ; Age
+blk                      ; Block
+sc                       ; Script
+
+# ================================================
+# Enumerated Properties
+# ================================================
+bc                       ; Bidi_Class
+bpt                      ; Bidi_Paired_Bracket_Type
+ccc                      ; Canonical_Combining_Class
+dt                       ; Decomposition_Type
+ea                       ; East_Asian_Width
+gc                       ; General_Category
+GCB                      ; Grapheme_Cluster_Break
+hst                      ; Hangul_Syllable_Type
+InPC                     ; Indic_Positional_Category
+InSC                     ; Indic_Syllabic_Category
+jg                       ; Joining_Group
+jt                       ; Joining_Type
+lb                       ; Line_Break
+NFC_QC                   ; NFC_Quick_Check
+NFD_QC                   ; NFD_Quick_Check
+NFKC_QC                  ; NFKC_Quick_Check
+NFKD_QC                  ; NFKD_Quick_Check
+nt                       ; Numeric_Type
+SB                       ; Sentence_Break
+vo                       ; Vertical_Orientation
+WB                       ; Word_Break
+
+# ================================================
+# Binary Properties
+# ================================================
+AHex                     ; ASCII_Hex_Digit
+Alpha                    ; Alphabetic
+Bidi_C                   ; Bidi_Control
+Bidi_M                   ; Bidi_Mirrored
+Cased                    ; Cased
+CE                       ; Composition_Exclusion
+CI                       ; Case_Ignorable
+Comp_Ex                  ; Full_Composition_Exclusion
+CWCF                     ; Changes_When_Casefolded
+CWCM                     ; Changes_When_Casemapped
+CWKCF                    ; Changes_When_NFKC_Casefolded
+CWL                      ; Changes_When_Lowercased
+CWT                      ; Changes_When_Titlecased
+CWU                      ; Changes_When_Uppercased
+Dash                     ; Dash
+Dep                      ; Deprecated
+DI                       ; Default_Ignorable_Code_Point
+Dia                      ; Diacritic
+EBase                    ; Emoji_Modifier_Base
+EComp                    ; Emoji_Component
+EMod                     ; Emoji_Modifier
+Emoji                    ; Emoji
+EPres                    ; Emoji_Presentation
+Ext                      ; Extender
+ExtPict                  ; Extended_Pictographic
+Gr_Base                  ; Grapheme_Base
+Gr_Ext                   ; Grapheme_Extend
+Gr_Link                  ; Grapheme_Link
+Hex                      ; Hex_Digit
+Hyphen                   ; Hyphen
+IDC                      ; ID_Continue
+Ideo                     ; Ideographic
+IDS                      ; ID_Start
+IDSB                     ; IDS_Binary_Operator
+IDST                     ; IDS_Trinary_Operator
+Join_C                   ; Join_Control
+LOE                      ; Logical_Order_Exception
+Lower                    ; Lowercase
+Math                     ; Math
+NChar                    ; Noncharacter_Code_Point
+OAlpha                   ; Other_Alphabetic
+ODI                      ; Other_Default_Ignorable_Code_Point
+OGr_Ext                  ; Other_Grapheme_Extend
+OIDC                     ; Other_ID_Continue
+OIDS                     ; Other_ID_Start
+OLower                   ; Other_Lowercase
+OMath                    ; Other_Math
+OUpper                   ; Other_Uppercase
+Pat_Syn                  ; Pattern_Syntax
+Pat_WS                   ; Pattern_White_Space
+PCM                      ; Prepended_Concatenation_Mark
+QMark                    ; Quotation_Mark
+Radical                  ; Radical
+RI                       ; Regional_Indicator
+SD                       ; Soft_Dotted
+STerm                    ; Sentence_Terminal
+Term                     ; Terminal_Punctuation
+UIdeo                    ; Unified_Ideograph
+Upper                    ; Uppercase
+VS                       ; Variation_Selector
+WSpace                   ; White_Space                 ; space
+XIDC                     ; XID_Continue
+XIDS                     ; XID_Start
+XO_NFC                   ; Expands_On_NFC
+XO_NFD                   ; Expands_On_NFD
+XO_NFKC                  ; Expands_On_NFKC
+XO_NFKD                  ; Expands_On_NFKD
+
+# ================================================
+# Total:    129
+
+# EOF
--- a/maint/Unicode.tables/PropertyValueAliases.txt
+++ b/maint/Unicode.tables/PropertyValueAliases.txt
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -2,7 +2,7 @@
 * A program for testing the Unicode property table *
 ***************************************************/

-/* Copyright (c) University of Cambridge 2008-2020 */
+/* Copyright (c) University of Cambridge 2008-2022 */

 /* Compile thus:

@ -14,40 +14,50 @@
 */

 /* This is a hacked-up program for testing the Unicode properties tables of
-PCRE2. It can also be used for finding characters with certain properties.
-I wrote it to help with debugging PCRE, and have added things that I found
-useful, in a rather haphazard way. The code has never been seriously tidied or
-checked for robustness, but it shouldn't now give compiler warnings.
+PCRE2. It can also be used for finding characters with certain properties. I
+wrote it to help with debugging, and have added things that I found useful, in
+a rather haphazard way. The code has never been seriously tidied or checked for
+robustness, but it shouldn't now give compiler warnings.

-There is only one option: "-s". If given, it applies only to the "findprop" 
-command. It causes the UTF-8 sequence of bytes that encode the character to be 
-output between angle brackets at the end of the line. On a UTF-8 terminal, this 
+There is only one option: "-s". If given, it applies only to the "findprop"
+command. It causes the UTF-8 sequence of bytes that encode the character to be
+output between angle brackets at the end of the line. On a UTF-8 terminal, this
 will show the appropriate graphic for the code point.

 If the command has arguments, they are concatenated into a buffer, separated by
 spaces. If the first argument starts "U+" or consists entirely of hexadecimal
 digits, "findprop" is inserted at the start. The buffer is then processed as a
 single line file, after which the program exits. If there are no arguments, the
-program reads commands line by line on stdin and writes output to stdout. The 
+program reads commands line by line on stdin and writes output to stdout. The
 return code is always zero.

 There are three commands:

-"findprop" must be followed by a space-separated list of Unicode code points as
-hex numbers, either without any prefix or starting with "U+". The output is one
-line per character, giving its Unicode properties followed by its other case or 
-cases if one or more exist, followed by its Script Extension list if it is not
-just the same as the base script. This list is in square brackets. The
-properties are:
+The command "findprop" must be followed by a space-separated list of Unicode
+code points as hex numbers, either without any prefix or starting with "U+", or
+as individual UTF-8 characters preceded by '+'. For example:

-General type        e.g. Letter
-Specific type       e.g. Upper case letter
-Script              e.g. Medefaidrin
-Grapheme break type e.g. Extend (most common is Other)
+  findprop U+1234 5Abc +?

-"find" must be followed by a list of property names and their values. The 
-values are case-sensitive. This finds characters that have those properties. If
-multiple properties are listed, they must all be matched. Currently supported:
+The output is one long line per character, listing Unicode properties that have
+values, followed by its other case or cases if one or more exist, followed by
+its Script Extension list if there is one. This list is in square brackets. A
+second list in square brackets gives all the Boolean properties of the
+character. The properties that come first are:
+
+  Bidi class          e.g. NSM (most common is L)
+  General type        e.g. Letter
+  Specific type       e.g. Upper case letter
+  Script              e.g. Medefaidrin
+  Grapheme break type e.g. Extend (most common is Other)
+
+Script names and Boolean property names are all in lower case, with underscores
+and hyphens removed, because that's how they are stored for "loose" matching.
+
+The command "find" must be followed by a list of property types and their
+values. The values are case-sensitive, except for bidi class. This finds
+characters that have those properties. If multiple properties are listed, they
+must all be matched. Currently supported:

  script <name>    The character must have this script property. Only one
                     such script may be given.
@ -56,17 +66,20 @@ multiple properties are listed, they must all be matched. Currently supported:
                     scripts must be present.
  type <abbrev>    The character's specific type (e.g. Lu or Nd) must match.
  gbreak <name>    The grapheme break property must match.
+  bidi <class>     The character's bidi class must match.
+  bool <name>      The character's Boolean property list must contain this
+                     property.

 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
-Script Extensions, there may be a mixture of positive and negative
-requirements. All must be satisfied.
+Script Extensions and Boolean properties, there may be a mixture of positive
+and negative requirements. All must be satisfied.

 Sequences of two or more characters are shown as ranges, for example
 U+0041..U+004A. No more than 100 lines are are output. If there are more
-characters, the list ends with ... 
+characters, the list ends with ...

-"list" must be followed by a property name (script, type, or gbreak). The
-defined values for that property are listed. */
+The command "list" must be followed by one of the property names script, bool,
+type, gbreak or bidi. The defined values for that property are listed. */


 #ifdef HAVE_CONFIG_H
@ -97,6 +110,9 @@ defined values for that property are listed. */
 #include <editline/readline.h>
 #else
 #include <readline/readline.h>
+#ifdef RL_VERSION_MAJOR
+#include <readline/history.h>
+#endif
 #endif
 #endif
 #endif
@ -145,7 +161,7 @@ static const unsigned char *type_names[] = {
  US"So", US"Other symbol",
  US"Zl", US"Line separator",
  US"Zp", US"Paragraph separator",
-  US"Zs", US"Space separator" 
+  US"Zs", US"Space separator"
 };

 static const unsigned char *gb_names[] = {
@ -160,12 +176,37 @@ static const unsigned char *gb_names[] = {
  US"T",                     US"Hangul syllable type T",
  US"LV",                    US"Hangul syllable type LV",
  US"LVT",                   US"Hangul syllable type LVT",
-  US"RegionalIndicator",     US"",
+  US"Regional_Indicator",    US"",
  US"Other",                 US"",
  US"ZWJ",                   US"zero width joiner",
  US"Extended_Pictographic", US""
 };

+static const unsigned char *bd_names[] = {
+  US"AL",   US"Arabic letter",
+  US"AN",   US"Arabic number",
+  US"B",    US"Paragraph separator",
+  US"BN",   US"Boundary neutral",
+  US"CS",   US"Common separator",
+  US"EN",   US"European number",
+  US"ES",   US"European separator",
+  US"ET",   US"European terminator",
+  US"FSI",  US"First strong isolate",
+  US"L",    US"Left-to-right",
+  US"LRE",  US"Left-to-right embedding",
+  US"LRI",  US"Left-to-right isolate",
+  US"LRO",  US"Left-to-right override",
+  US"NSM",  US"Non-spacing mark",
+  US"ON",   US"Other neutral",
+  US"PDF",  US"Pop directional format",
+  US"PDI",  US"Pop directional isolate",
+  US"R",    US"Right-to-left",
+  US"RLE",  US"Right-to-left embedding",
+  US"RLI",  US"Right-to-left isolate",
+  US"RLO",  US"Right-to-left override",
+  US"S",    US"Segment separator",
+  US"WS",   US"White space"
+};

 static const unsigned int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
@ -173,6 +214,41 @@ static const unsigned int utf8_table1[] = {
 static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

+/* Macro to pick up the remaining bytes of a multi-byte UTF-8 character whose
+first byte is already in c, advancing the pointer past the bytes it reads. */
+
+#define GETUTF8INC(c, eptr) \
+    { \
+    if ((c & 0x20u) == 0) \
+      c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
+    else if ((c & 0x10u) == 0) \
+      { \
+      c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
+      eptr += 2; \
+      } \
+    else if ((c & 0x08u) == 0) \
+      { \
+      c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
+          ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
+      eptr += 3; \
+      } \
+    else if ((c & 0x04u) == 0) \
+      { \
+      c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
+          ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
+          (eptr[3] & 0x3fu); \
+      eptr += 4; \
+      } \
+    else \
+      { \
+      c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
+          ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
+          ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
+      eptr += 5; \
+      } \
+    }
+
+

 /*************************************************
 *       Convert character value to UTF-8         *
@ -224,25 +300,54 @@ return isatty(fileno(stdin));


 /*************************************************
-*      Get script name from ucp ident            *
+*        Get property name from ucp ident        *
 *************************************************/

-static const char *
-get_scriptname(int script)
-{
-size_t i;
-const ucp_type_table *u;
+/* The utt table contains both full names and abbreviations. So search for both
+and use the longer if two are found, unless the first one is only 3 characters
+and we are looking for a script (some scripts have 3-character names). If this
+were not just a test program it might be worth making some kind of reverse
+index. */

+static const char *
+get_propname(int prop, int type)
+{
+size_t i, j, len;
+size_t foundlist[2];
+const char *yield;
+int typex = (type == PT_SC)? PT_SCX : type;
+
+j = 0;
 for (i = 0; i < PRIV(utt_size); i++)
  {
-  u = PRIV(utt) + i; 
-  if (u->type == PT_SC && u->value == script) break;
+  const ucp_type_table *u = PRIV(utt) + i;
+  if ((u->type == type || u->type == typex) && u->value == prop)
+    {
+    foundlist[j++] = i;
+    if (j >= 2) break;
+    }
  }
-if (i < PRIV(utt_size))
-  return PRIV(utt_names) + u->name_offset;
  
-return "??";
-}  
+if (j == 0) return "??";
+
+yield = NULL;
+len = 0;
+
+for (i = 0; i < j; i++)
+  {
+  const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
+  size_t sl = strlen(s);
+
+  if (sl > len)
+    {
+    yield = s;
+    if (sl == 3 && type == PT_SC) break;
+    len = sl;
+    }
+  }
+
+return yield;
+}


 /*************************************************
@ -257,13 +362,16 @@ int fulltype = UCD_CHARTYPE(c);
 int script = UCD_SCRIPT(c);
 int scriptx = UCD_SCRIPTX(c);
 int gbprop = UCD_GRAPHBREAK(c);
+int bidi = UCD_BIDICLASS(c);
 unsigned int othercase = UCD_OTHERCASE(c);
 int caseset = UCD_CASESET(c);
+int bprops = UCD_BPROPS(c);

 const unsigned char *fulltypename = US"??";
 const unsigned char *typename = US"??";
 const unsigned char *graphbreak = US"??";
-const unsigned char *scriptname = CUS get_scriptname(script);
+const unsigned char *bidiclass = US"??";
+const unsigned char *scriptname = CUS get_propname(script, PT_SC);

 switch (type)
  {
@ -323,7 +431,7 @@ switch(gbprop)
  case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
  case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
  case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
-  case ucp_gbRegionalIndicator:
+  case ucp_gbRegional_Indicator:
                           graphbreak = US"Regional Indicator"; break;
  case ucp_gbOther:        graphbreak = US"Other"; break;
  case ucp_gbZWJ:          graphbreak = US"Zero Width Joiner"; break;
@ -332,7 +440,37 @@ switch(gbprop)
  default:                 graphbreak = US"Unknown"; break;
  }

-printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
+switch(bidi)
+  {
+  case ucp_bidiAL:   bidiclass = US"AL "; break;
+  case ucp_bidiFSI:  bidiclass = US"FSI"; break;
+  case ucp_bidiL:    bidiclass = US"L  "; break;
+  case ucp_bidiLRE:  bidiclass = US"LRE"; break;
+  case ucp_bidiLRI:  bidiclass = US"LRI"; break;
+  case ucp_bidiLRO:  bidiclass = US"LRO"; break;
+  case ucp_bidiPDF:  bidiclass = US"PDF"; break;
+  case ucp_bidiPDI:  bidiclass = US"PDI"; break;
+  case ucp_bidiR:    bidiclass = US"R  "; break;
+  case ucp_bidiRLE:  bidiclass = US"RLE"; break;
+  case ucp_bidiRLI:  bidiclass = US"RLI"; break;
+  case ucp_bidiRLO:  bidiclass = US"RLO"; break;
+  case ucp_bidiAN:   bidiclass = US"AN "; break;
+  case ucp_bidiB:    bidiclass = US"B  "; break;
+  case ucp_bidiBN:   bidiclass = US"BN "; break;
+  case ucp_bidiCS:   bidiclass = US"CS "; break;
+  case ucp_bidiEN:   bidiclass = US"EN "; break;
+  case ucp_bidiES:   bidiclass = US"ES "; break;
+  case ucp_bidiET:   bidiclass = US"ET "; break;
+  case ucp_bidiNSM:  bidiclass = US"NSM"; break;
+  case ucp_bidiON:   bidiclass = US"ON "; break;
+  case ucp_bidiS:    bidiclass = US"S  "; break;
+  case ucp_bidiWS:   bidiclass = US"WS "; break;
+  default:           bidiclass = US"???"; break;
+  }
+
+printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
+  scriptname, graphbreak);
+
 if (is_just_one && othercase != c)
  {
  printf(", U+%04X", othercase);
@ -341,36 +479,47 @@ if (is_just_one && othercase != c)
    const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
    while (*(++p) < NOTACHAR)
      {
-      unsigned int d = *p;  
+      unsigned int d = *p;
      if (d != othercase && d != c) printf(", U+%04X", d);
-      } 
+      }
    }
  }

-if (scriptx != script)
+if (scriptx != 0)
  {
+  const char *sep = "";
+  const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
  printf(", [");
-  if (scriptx >= 0)
-    printf("%s", get_scriptname(scriptx));
-  else
+  for (int i = 0; i < ucp_Unknown; i++)
+  if (MAPBIT(p, i) != 0)
    {
-    const char *sep = "";
-    const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
-    while (*p != 0)
-      {
-      printf("%s%s", sep, get_scriptname(*p++));
-      sep = ", ";
-      }
+    printf("%s%s", sep, get_propname(i, PT_SC));
+    sep = ", ";
    }
  printf("]");
  }
-  
+
+/* Output the names of the character's Boolean properties in square brackets
+when its property-set index is non-zero. The index selects a bit map in
+ucd_boolprop_sets; each bit that is set is converted to a name by calling
+get_propname(). */
+
+if (bprops != 0)
+  {
+  const uint32_t *bitmap = PRIV(ucd_boolprop_sets) +
+    bprops * ucd_boolprop_sets_item_size;
+  BOOL first = TRUE;
+
+  printf(", [");
+  for (int bp = 0; bp < ucp_Bprop_Count; bp++)
+    {
+    if (MAPBIT(bitmap, bp) == 0) continue;
+    printf("%s%s", first? "" : ", ", get_propname(bp, PT_BOOL));
+    first = FALSE;
+    }
+  printf("]");
+  }
+
 if (show_character && is_just_one)
  {
  unsigned char buffer[8];
  size_t len = ord2utf8(c, buffer);
-  printf(", >%.*s<", (int)len, buffer);  
-  }  
+  printf(", >%.*s<", (int)len, buffer);
+  }

 printf("\n");
 }
@ -384,19 +533,23 @@ printf("\n");
 static void
 find_chars(unsigned char *s)
 {
-unsigned char name[24];
-unsigned char value[24];
+unsigned char name[128];
+unsigned char value[128];
 unsigned char *t;
 unsigned int count= 0;
-int scriptx_list[24];
+int scriptx_list[128];
 unsigned int scriptx_count = 0;
+int bprop_list[128];
+unsigned int bprop_count = 0;
 uint32_t i, c;
 int script = -1;
 int type = -1;
 int gbreak = -1;
+int bidiclass = -1;
 BOOL script_not = FALSE;
 BOOL type_not = FALSE;
 BOOL gbreak_not = FALSE;
+BOOL bidiclass_not = FALSE;
 BOOL hadrange = FALSE;
 const ucd_record *ucd, *next_ucd;
 const char *pad = "        ";
@ -410,13 +563,18 @@ while (*s != 0)
  *t = 0;
  while (isspace(*s)) s++;

-  for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
+  /* Copy the value, dropping underscores and hyphens. Characters that would
+  not fit in value[] (leaving room for the terminating zero) are discarded
+  instead of being written past the end of the buffer. */
+
+  for (t = value; *s != 0 && !isspace(*s); s++)
+    {
+    if (*s != '_' && *s != '-' && t < value + sizeof(value) - 1) *t++ = *s;
+    }
  *t = 0;
  while (isspace(*s)) s++;

  if (strcmp(CS name, "script") == 0 ||
      strcmp(CS name, "scriptx") == 0)
    {
+    for (t = value; *t != 0; t++) *t = tolower(*t);
+ 
    if (value[0] == '!')
      {
      if (name[6] == 'x') scriptx_not = TRUE;
@ -426,11 +584,11 @@ while (*s != 0)

    for (i = 0; i < PRIV(utt_size); i++)
      {
-      const ucp_type_table *u = PRIV(utt) + i; 
-      if (u->type == PT_SC && strcmp(CS(value + offset), 
+      const ucp_type_table *u = PRIV(utt) + i;
+      if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
            PRIV(utt_names) + u->name_offset) == 0)
        {
-        c = u->value; 
+        c = u->value;
        if (name[6] == 'x')
          {
          scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
@ -454,6 +612,33 @@ while (*s != 0)
      }
    }

+  /* "bool <name>" or "bool !<name>": add a Boolean property requirement. The
+  property's table value is remembered in bprop_list, multiplied by -1 for a
+  negated requirement.
+
+  NOTE(review): a property whose table value is 0 cannot be negated this way,
+  because 0 * -1 is still 0 and the search treats every non-negative entry as
+  a positive requirement. */
+
+  else if (strcmp(CS name, "bool") == 0)
+    {
+    int not = 1;
+
+    /* Fold the value to lower case before comparing it with the table names,
+    in the same way as is done for script names. */
+
+    for (t = value; *t != 0; t++) *t = tolower(*t);
+
+    if (value[0] == '!')
+      {
+      not = -1;
+      offset = 1;
+      }
+
+    for (i = 0; i < PRIV(utt_size); i++)
+      {
+      const ucp_type_table *u = PRIV(utt) + i;
+      if (u->type == PT_BOOL && strcmp(CS(value + offset),
+            PRIV(utt_names) + u->name_offset) == 0)
+        {
+        /* Do not write beyond the end of the requirements list. */
+        if (bprop_count >= sizeof(bprop_list)/sizeof(bprop_list[0]))
+          {
+          printf("** Too many Boolean properties\n");
+          return;
+          }
+        bprop_list[bprop_count++] = u->value * not;
+        break;
+        }
+      }
+
+    if (i >= PRIV(utt_size))
+      {
+      printf("** Unrecognized property name \"%s\"\n", value);
+      return;
+      }
+    }
+
+
  else if (strcmp(CS name, "type") == 0)
    {
    if (type >= 0)
@ -516,6 +701,38 @@ while (*s != 0)
      }
    }

+  /* "bidi", "bidiclass" or "bidi_class": select a single bidi class, possibly
+  negated by a leading '!'. The class number is half the index of the matching
+  entry in bd_names, which is scanned two entries at a time. */
+
+  else if (strcmp(CS name, "bidi") == 0 ||
+           strcmp(CS name, "bidiclass") == 0 ||
+           strcmp(CS name, "bidi_class") == 0 )
+    {
+    const size_t bd_count = sizeof(bd_names)/sizeof(char *);
+
+    if (bidiclass >= 0)
+      {
+      printf("** Only 1 bidi class value allowed\n");
+      return;
+      }
+
+    if (value[0] == '!')
+      {
+      bidiclass_not = TRUE;
+      offset = 1;
+      }
+
+    /* Caseless search; stops at the first match or at the end of the list. */
+
+    i = 0;
+    while (i < bd_count &&
+           strcasecmp(CS (value + offset), CS bd_names[i]) != 0)
+      i += 2;
+
+    if (i >= bd_count)
+      {
+      printf("** Unrecognized bidi class name \"%s\"\n", value);
+      return;
+      }
+
+    bidiclass = i/2;
+    }
+
+
  else
    {
    printf("** Unrecognized property name \"%s\"\n", name);
@ -523,7 +740,8 @@ while (*s != 0)
    }
  }

-if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
+if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
+    gbreak < 0 && bidiclass < 0)
  {
  printf("** No properties specified\n");
  return;
@ -535,55 +753,55 @@ for (c = 0; c <= 0x10ffff; c++)

  if (scriptx_count > 0)
    {
-    const uint8_t *char_scriptx = NULL;
+    const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
    unsigned int found = 0;
-    int scriptx = UCD_SCRIPTX(c);
-
-    if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;

    for (i = 0; i < scriptx_count; i++)
      {
+      int x = scriptx_list[i]/32;
+      int y = scriptx_list[i]%32;
+
      /* Positive requirment */
      if (scriptx_list[i] >= 0)
        {
-        if (scriptx >= 0)
-          {
-          if (scriptx == scriptx_list[i]) found++;
-          }
-
-        else
-          {
-          const uint8_t *p;
-          for (p = char_scriptx; *p != 0; p++)
-            {
-            if (scriptx_list[i] == *p)
-              {
-              found++;
-              break;
-              }
-            }
-          }
+        if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
        }
      /* Negative requirement */
      else
        {
-        if (scriptx >= 0)
-          {
-          if (scriptx != -scriptx_list[i]) found++;
-          }
-        else
-          {
-          const uint8_t *p;
-          for (p = char_scriptx; *p != 0; p++)
-            if (-scriptx_list[i] == *p) break;
-          if (*p == 0) found++;
-          }
+        if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
        }
      }

    if (found != scriptx_count) continue;
    }

+  /* Check the Boolean property requirements. Each entry in bprop_list is a
+  property number, negative when the character must not have the property.
+  The character is skipped unless every requirement is satisfied. */
+
+  if (bprop_count > 0)
+    {
+    const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
+      UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
+    unsigned int found = 0;
+
+    for (i = 0; i < bprop_count; i++)
+      {
+      BOOL wanted = bprop_list[i] >= 0;
+      int prop = wanted? bprop_list[i] : -bprop_list[i];
+      BOOL present = (bits_bprop[prop/32] & (1u << (prop%32))) != 0;
+
+      /* Satisfied when presence agrees with the sense of the requirement. */
+      if (present == wanted) found++;
+      }
+
+    if (found != bprop_count) continue;
+    }
+
+
  if (type >= 0)
    {
    if (type_not)
@ -608,6 +826,18 @@ for (c = 0; c <= 0x10ffff; c++)
      }
    }

+  /* Check the bidi class requirement: skip a character whose class matches
+  when the requirement is negated, or differs when it is not. */
+
+  if (bidiclass >= 0)
+    {
+    BOOL same = bidiclass == UCD_BIDICLASS(c);
+    if (bidiclass_not? same : !same) continue;
+    }
+
  /* All conditions are met. Look for runs. */

  ucd = GET_UCD(c);
@ -663,23 +893,37 @@ if (strcmp(CS name, "findprop") == 0)
  {
  while (*s != 0)
    {
-    unsigned int c; 
+    unsigned int c;
    unsigned char *endptr;
-    t = s; 
-    if (strncmp(CS t, "U+", 2) == 0) t += 2;
-    c = strtoul(CS t, CSS(&endptr), 16);
+    t = s;
+
+    /* A specifier is either '+' followed by a literal character, or a
+    hexadecimal code point with an optional "U+" prefix. In both cases endptr
+    is left at the first byte after the specifier. */
+
+    if (*t == '+')
+      {
+      c = *(++t);
+      if (c > 0x7fu)
+        {
+        /* NOTE(review): GETCHARINC is expected to leave t after the last
+        byte of the multi-byte character, so t itself is the end - confirm
+        against the macro's definition. */
+        GETCHARINC(c, t);
+        endptr = t;
+        }
+      else
+        {
+        /* t is still at the single-byte character. When '+' is the last
+        thing in the string, do not step past the terminating zero. */
+        endptr = (c == 0)? t : t+1;
+        }
+      }
+    else
+      {
+      if (strncmp(CS t, "U+", 2) == 0) t += 2;
+      c = strtoul(CS t, CSS(&endptr), 16);
+      }
+
    if (*endptr != 0 && !isspace(*endptr))
      {
      while (*endptr != 0 && !isspace(*endptr)) endptr++;
-      printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s);
+      printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
      }
-    else  
+    else
      {
-      if (c > 0x10ffff) 
+      if (c > 0x10ffff)
        printf("** U+%x is too big for a Unicode code point\n", c);
-      else   
+      else
        print_prop(c, TRUE);
-      } 
+      }
    s = endptr;
    while (isspace(*s)) s++;
    }
@ -689,7 +933,7 @@ else if (strcmp(CS name, "find") == 0)
  {
  find_chars(s);
  }
-  
+
 else if (strcmp(CS name, "list") == 0)
  {
  while (*s != 0)
@ -698,38 +942,52 @@ else if (strcmp(CS name, "list") == 0)
    for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
    *t = 0;
    while (isspace(*s)) s++;
-    
+
    if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
      {
-      for (i = 0; i < PRIV(utt_size); i++) 
-        if (PRIV(utt)[i].type == PT_SC)
-          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);  
+      for (i = 0; i < PRIV(utt_size); i++)
+        if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
+          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
      }
-      
+
+    else if (strcmp(CS name, "bool") == 0)
+      {
+      for (i = 0; i < PRIV(utt_size); i++)
+        if (PRIV(utt)[i].type == PT_BOOL)
+          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
+      }
+
    else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
      {
      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
-        printf("%s %s\n", type_names[i], type_names[i+1]); 
-      }  
-      
+        printf("%s %s\n", type_names[i], type_names[i+1]);
+      }
+
    else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
      {
      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
        {
-        if (gb_names[i+1][0] != 0)  
+        if (gb_names[i+1][0] != 0)
          printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
-        else   
+        else
          printf("%s\n", gb_names[i]);
-        } 
-      }    
+        }
+      }

-    else 
+    else if (strcmp(CS name, "bidi") == 0 ||
+             strcmp(CS name, "bidiclasses") == 0)
      {
-      printf("** Unknown property \"%s\"\n", name);  
+      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
+        printf("%3s %s\n", bd_names[i], bd_names[i+1]);
+      }
+
+    else
+      {
+      printf("** Unknown property \"%s\"\n", name);
      break;
-      }  
-    }  
-  }  
+      }
+    }
+  }

 else printf("** Unknown test command \"%s\"\n", name);
 }
@ -751,32 +1009,32 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0)
  {
  show_character = TRUE;
  first_arg++;
-  }   
+  }

 if (argc > first_arg)
  {
  int i;
-  BOOL hexfirst = TRUE; 
-  char *arg = argv[first_arg]; 
+  BOOL datafirst = TRUE;
+  char *arg = argv[first_arg];
  unsigned char *s = buffer;
-  
-  if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg)) 
+
+  if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
    {
-    while (*arg != 0) 
+    while (*arg != 0)
      {
-      if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }  
-      } 
-    } 
-     
-  if (hexfirst)
+      if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
+      }
+    }
+
+  if (datafirst)
    {
    strcpy(CS s, "findprop ");
    s += 9;
    }
-    
+
  for (i = first_arg; i < argc; i++)
    {
-    s += sprintf(CS s, "%s ", argv[i]);       
+    /* Append the argument and a separating space. The remaining space in
+    the command buffer is passed to snprintf() so that over-long arguments
+    are diagnosed instead of overflowing the buffer. */
+
+    size_t room = sizeof(buffer) - (size_t)(s - buffer);
+    int n = snprintf(CS s, room, "%s ", argv[i]);
+    if (n < 0 || (size_t)n >= room)
+      {
+      printf("** Command line arguments are too long\n");
+      return 1;
+      }
+    s += n;
    }

  process_command_line(buffer);
@ -812,7 +1070,7 @@ for(;;)
    if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
    if (!interactive) printf("%s", buffer);
    }
-    
+
  process_command_line(buffer);
  }

--- a/maint/ucptestdata/testinput1
+++ b/maint/ucptestdata/testinput1
@ -46,3 +46,5 @@ findprop 32ff
 findprop 1f16d

 findprop U+10e93 U+10eaa
+
+findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
--- a/maint/ucptestdata/testinput2
+++ b/maint/ucptestdata/testinput2
@ -4,3 +4,16 @@ find type Sk
 find type Pd
 find gbreak LVT
 find script Old_Uyghur
+find bidi PDF
+find bidi CS
+find bidi CS type Sm
+find bidi B
+find bidi FSI
+find bidi PDI
+find bidi RLI
+find bidi RLO
+find bidi S
+find bidi WS
+find script bopo
+find bool prependedconcatenationmark
+find bool pcm
--- a/maint/ucptestdata/testoutput1
+++ b/maint/ucptestdata/testoutput1
@ -1,398 +1,409 @@
 findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 
-U+0000 Control: Control, Common, Control
-U+0001 Control: Control, Common, Control
-U+0002 Control: Control, Common, Control
-U+0003 Control: Control, Common, Control
-U+0004 Control: Control, Common, Control
-U+0005 Control: Control, Common, Control
-U+0006 Control: Control, Common, Control
-U+0007 Control: Control, Common, Control
-U+0008 Control: Control, Common, Control
-U+0009 Control: Control, Common, Control
-U+000A Control: Control, Common, LF
-U+000B Control: Control, Common, Control
-U+000C Control: Control, Common, Control
-U+000D Control: Control, Common, CR
-U+000E Control: Control, Common, Control
-U+000F Control: Control, Common, Control
+U+0000 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0001 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0002 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0003 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0004 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0005 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0006 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0007 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0008 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0009 S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000A B   Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000B S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000C WS  Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000D B   Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000E BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+000F BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
 findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 
-U+0010 Control: Control, Common, Control
-U+0011 Control: Control, Common, Control
-U+0012 Control: Control, Common, Control
-U+0013 Control: Control, Common, Control
-U+0014 Control: Control, Common, Control
-U+0015 Control: Control, Common, Control
-U+0016 Control: Control, Common, Control
-U+0017 Control: Control, Common, Control
-U+0018 Control: Control, Common, Control
-U+0019 Control: Control, Common, Control
-U+001A Control: Control, Common, Control
-U+001B Control: Control, Common, Control
-U+001C Control: Control, Common, Control
-U+001D Control: Control, Common, Control
-U+001E Control: Control, Common, Control
-U+001F Control: Control, Common, Control
+U+0010 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0011 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0012 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0013 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0014 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0015 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0016 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0017 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0018 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0019 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001A BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001B BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001C B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001D B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001E B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001F S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
 findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 
-U+0020 Separator: Space separator, Common, Other
-U+0021 Punctuation: Other punctuation, Common, Other
-U+0022 Punctuation: Other punctuation, Common, Other
-U+0023 Punctuation: Other punctuation, Common, Other
-U+0024 Symbol: Currency symbol, Common, Other
-U+0025 Punctuation: Other punctuation, Common, Other
-U+0026 Punctuation: Other punctuation, Common, Other
-U+0027 Punctuation: Other punctuation, Common, Other
-U+0028 Punctuation: Open punctuation, Common, Other
-U+0029 Punctuation: Close punctuation, Common, Other
-U+002A Punctuation: Other punctuation, Common, Other
-U+002B Symbol: Mathematical symbol, Common, Other
-U+002C Punctuation: Other punctuation, Common, Other
-U+002D Punctuation: Dash punctuation, Common, Other
-U+002E Punctuation: Other punctuation, Common, Other
-U+002F Punctuation: Other punctuation, Common, Other
+U+0020 WS  Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
+U+0021 ON  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
+U+0022 ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, math, patternsyntax]
+U+0023 ET  Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
+U+0024 ET  Symbol: Currency symbol, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0025 ET  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0026 ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0027 ON  Punctuation: Other punctuation, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
+U+0028 ON  Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+0029 ON  Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+002A ON  Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
+U+002B ES  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+002C CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+002D ES  Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+002E CS  Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
+U+002F CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
 findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 
-U+0030 Number: Decimal number, Common, Other
-U+0031 Number: Decimal number, Common, Other
-U+0032 Number: Decimal number, Common, Other
-U+0033 Number: Decimal number, Common, Other
-U+0034 Number: Decimal number, Common, Other
-U+0035 Number: Decimal number, Common, Other
-U+0036 Number: Decimal number, Common, Other
-U+0037 Number: Decimal number, Common, Other
-U+0038 Number: Decimal number, Common, Other
-U+0039 Number: Decimal number, Common, Other
-U+003A Punctuation: Other punctuation, Common, Other
-U+003B Punctuation: Other punctuation, Common, Other
-U+003C Symbol: Mathematical symbol, Common, Other
-U+003D Symbol: Mathematical symbol, Common, Other
-U+003E Symbol: Mathematical symbol, Common, Other
-U+003F Punctuation: Other punctuation, Common, Other
+U+0030 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0031 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0032 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0033 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0034 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0035 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0036 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0037 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0038 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0039 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+003A CS  Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+003B ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+003C ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
+U+003D ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+003E ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
+U+003F ON  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
 findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 
-U+0040 Punctuation: Other punctuation, Common, Other
-U+0041 Letter: Upper case letter, Latin, Other, U+0061
-U+0042 Letter: Upper case letter, Latin, Other, U+0062
-U+0043 Letter: Upper case letter, Latin, Other, U+0063
-U+0044 Letter: Upper case letter, Latin, Other, U+0064
-U+0045 Letter: Upper case letter, Latin, Other, U+0065
-U+0046 Letter: Upper case letter, Latin, Other, U+0066
-U+0047 Letter: Upper case letter, Latin, Other, U+0067
-U+0048 Letter: Upper case letter, Latin, Other, U+0068
-U+0049 Letter: Upper case letter, Latin, Other, U+0069
-U+004A Letter: Upper case letter, Latin, Other, U+006A
-U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A
-U+004C Letter: Upper case letter, Latin, Other, U+006C
-U+004D Letter: Upper case letter, Latin, Other, U+006D
-U+004E Letter: Upper case letter, Latin, Other, U+006E
-U+004F Letter: Upper case letter, Latin, Other, U+006F
+U+0040 ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0041 L   Letter: Upper case letter, latin, Other, U+0061, [graphemebase]
+U+0042 L   Letter: Upper case letter, latin, Other, U+0062, [graphemebase]
+U+0043 L   Letter: Upper case letter, latin, Other, U+0063, [graphemebase]
+U+0044 L   Letter: Upper case letter, latin, Other, U+0064, [graphemebase]
+U+0045 L   Letter: Upper case letter, latin, Other, U+0065, [graphemebase]
+U+0046 L   Letter: Upper case letter, latin, Other, U+0066, [graphemebase]
+U+0047 L   Letter: Upper case letter, latin, Other, U+0067, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0048 L   Letter: Upper case letter, latin, Other, U+0068, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0049 L   Letter: Upper case letter, latin, Other, U+0069, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004A L   Letter: Upper case letter, latin, Other, U+006A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004B L   Letter: Upper case letter, latin, Other, U+006B, U+212A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004C L   Letter: Upper case letter, latin, Other, U+006C, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004D L   Letter: Upper case letter, latin, Other, U+006D, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004E L   Letter: Upper case letter, latin, Other, U+006E, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004F L   Letter: Upper case letter, latin, Other, U+006F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
 findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 
-U+0050 Letter: Upper case letter, Latin, Other, U+0070
-U+0051 Letter: Upper case letter, Latin, Other, U+0071
-U+0052 Letter: Upper case letter, Latin, Other, U+0072
-U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F
-U+0054 Letter: Upper case letter, Latin, Other, U+0074
-U+0055 Letter: Upper case letter, Latin, Other, U+0075
-U+0056 Letter: Upper case letter, Latin, Other, U+0076
-U+0057 Letter: Upper case letter, Latin, Other, U+0077
-U+0058 Letter: Upper case letter, Latin, Other, U+0078
-U+0059 Letter: Upper case letter, Latin, Other, U+0079
-U+005A Letter: Upper case letter, Latin, Other, U+007A
-U+005B Punctuation: Open punctuation, Common, Other
-U+005C Punctuation: Other punctuation, Common, Other
-U+005D Punctuation: Close punctuation, Common, Other
-U+005E Symbol: Modifier symbol, Common, Other
-U+005F Punctuation: Connector punctuation, Common, Other
+U+0050 L   Letter: Upper case letter, latin, Other, U+0070, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0051 L   Letter: Upper case letter, latin, Other, U+0071, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0052 L   Letter: Upper case letter, latin, Other, U+0072, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0053 L   Letter: Upper case letter, latin, Other, U+0073, U+017F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0054 L   Letter: Upper case letter, latin, Other, U+0074, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0055 L   Letter: Upper case letter, latin, Other, U+0075, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0056 L   Letter: Upper case letter, latin, Other, U+0076, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0057 L   Letter: Upper case letter, latin, Other, U+0077, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0058 L   Letter: Upper case letter, latin, Other, U+0078, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0059 L   Letter: Upper case letter, latin, Other, U+0079, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+005A L   Letter: Upper case letter, latin, Other, U+007A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+005B ON  Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+005C ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+005D ON  Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+005E ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+005F ON  Punctuation: Connector punctuation, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, deprecated, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
 findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 
-U+0060 Symbol: Modifier symbol, Common, Other
-U+0061 Letter: Lower case letter, Latin, Other, U+0041
-U+0062 Letter: Lower case letter, Latin, Other, U+0042
-U+0063 Letter: Lower case letter, Latin, Other, U+0043
-U+0064 Letter: Lower case letter, Latin, Other, U+0044
-U+0065 Letter: Lower case letter, Latin, Other, U+0045
-U+0066 Letter: Lower case letter, Latin, Other, U+0046
-U+0067 Letter: Lower case letter, Latin, Other, U+0047
-U+0068 Letter: Lower case letter, Latin, Other, U+0048
-U+0069 Letter: Lower case letter, Latin, Other, U+0049
-U+006A Letter: Lower case letter, Latin, Other, U+004A
-U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A
-U+006C Letter: Lower case letter, Latin, Other, U+004C
-U+006D Letter: Lower case letter, Latin, Other, U+004D
-U+006E Letter: Lower case letter, Latin, Other, U+004E
-U+006F Letter: Lower case letter, Latin, Other, U+004F
+U+0060 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+0061 L   Letter: Lower case letter, latin, Other, U+0041, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0062 L   Letter: Lower case letter, latin, Other, U+0042, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0063 L   Letter: Lower case letter, latin, Other, U+0043, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0064 L   Letter: Lower case letter, latin, Other, U+0044, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0065 L   Letter: Lower case letter, latin, Other, U+0045, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0066 L   Letter: Lower case letter, latin, Other, U+0046, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0067 L   Letter: Lower case letter, latin, Other, U+0047, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0068 L   Letter: Lower case letter, latin, Other, U+0048, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0069 L   Letter: Lower case letter, latin, Other, U+0049, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+006A L   Letter: Lower case letter, latin, Other, U+004A, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+006B L   Letter: Lower case letter, latin, Other, U+004B, U+212A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006C L   Letter: Lower case letter, latin, Other, U+004C, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006D L   Letter: Lower case letter, latin, Other, U+004D, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006E L   Letter: Lower case letter, latin, Other, U+004E, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006F L   Letter: Lower case letter, latin, Other, U+004F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 
-U+0070 Letter: Lower case letter, Latin, Other, U+0050
-U+0071 Letter: Lower case letter, Latin, Other, U+0051
-U+0072 Letter: Lower case letter, Latin, Other, U+0052
-U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F
-U+0074 Letter: Lower case letter, Latin, Other, U+0054
-U+0075 Letter: Lower case letter, Latin, Other, U+0055
-U+0076 Letter: Lower case letter, Latin, Other, U+0056
-U+0077 Letter: Lower case letter, Latin, Other, U+0057
-U+0078 Letter: Lower case letter, Latin, Other, U+0058
-U+0079 Letter: Lower case letter, Latin, Other, U+0059
-U+007A Letter: Lower case letter, Latin, Other, U+005A
-U+007B Punctuation: Open punctuation, Common, Other
-U+007C Symbol: Mathematical symbol, Common, Other
-U+007D Punctuation: Close punctuation, Common, Other
-U+007E Symbol: Mathematical symbol, Common, Other
-U+007F Control: Control, Common, Control
+U+0070 L   Letter: Lower case letter, latin, Other, U+0050, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0071 L   Letter: Lower case letter, latin, Other, U+0051, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0072 L   Letter: Lower case letter, latin, Other, U+0052, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0073 L   Letter: Lower case letter, latin, Other, U+0053, U+017F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0074 L   Letter: Lower case letter, latin, Other, U+0054, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0075 L   Letter: Lower case letter, latin, Other, U+0055, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0076 L   Letter: Lower case letter, latin, Other, U+0056, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0077 L   Letter: Lower case letter, latin, Other, U+0057, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0078 L   Letter: Lower case letter, latin, Other, U+0058, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0079 L   Letter: Lower case letter, latin, Other, U+0059, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+007A L   Letter: Lower case letter, latin, Other, U+005A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+007B ON  Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+007C ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+007D ON  Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+007E ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+007F BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]

 findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 
-U+0080 Control: Control, Common, Control
-U+0081 Control: Control, Common, Control
-U+0082 Control: Control, Common, Control
-U+0083 Control: Control, Common, Control
-U+0084 Control: Control, Common, Control
-U+0085 Control: Control, Common, Control
-U+0086 Control: Control, Common, Control
-U+0087 Control: Control, Common, Control
-U+0088 Control: Control, Common, Control
-U+0089 Control: Control, Common, Control
-U+008A Control: Control, Common, Control
-U+008B Control: Control, Common, Control
-U+008C Control: Control, Common, Control
-U+008D Control: Control, Common, Control
-U+008E Control: Control, Common, Control
-U+008F Control: Control, Common, Control
+U+0080 BN  Control: Control, common, Control
+U+0081 BN  Control: Control, common, Control
+U+0082 BN  Control: Control, common, Control
+U+0083 BN  Control: Control, common, Control
+U+0084 BN  Control: Control, common, Control
+U+0085 B   Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
+U+0086 BN  Control: Control, common, Control
+U+0087 BN  Control: Control, common, Control
+U+0088 BN  Control: Control, common, Control
+U+0089 BN  Control: Control, common, Control
+U+008A BN  Control: Control, common, Control
+U+008B BN  Control: Control, common, Control
+U+008C BN  Control: Control, common, Control
+U+008D BN  Control: Control, common, Control
+U+008E BN  Control: Control, common, Control
+U+008F BN  Control: Control, common, Control
 findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f 
-U+0090 Control: Control, Common, Control
-U+0091 Control: Control, Common, Control
-U+0092 Control: Control, Common, Control
-U+0093 Control: Control, Common, Control
-U+0094 Control: Control, Common, Control
-U+0095 Control: Control, Common, Control
-U+0096 Control: Control, Common, Control
-U+0097 Control: Control, Common, Control
-U+0098 Control: Control, Common, Control
-U+0099 Control: Control, Common, Control
-U+009A Control: Control, Common, Control
-U+009B Control: Control, Common, Control
-U+009C Control: Control, Common, Control
-U+009D Control: Control, Common, Control
-U+009E Control: Control, Common, Control
-U+009F Control: Control, Common, Control
+U+0090 BN  Control: Control, common, Control
+U+0091 BN  Control: Control, common, Control
+U+0092 BN  Control: Control, common, Control
+U+0093 BN  Control: Control, common, Control
+U+0094 BN  Control: Control, common, Control
+U+0095 BN  Control: Control, common, Control
+U+0096 BN  Control: Control, common, Control
+U+0097 BN  Control: Control, common, Control
+U+0098 BN  Control: Control, common, Control
+U+0099 BN  Control: Control, common, Control
+U+009A BN  Control: Control, common, Control
+U+009B BN  Control: Control, common, Control
+U+009C BN  Control: Control, common, Control
+U+009D BN  Control: Control, common, Control
+U+009E BN  Control: Control, common, Control
+U+009F BN  Control: Control, common, Control
 findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af 
-U+00A0 Separator: Space separator, Common, Other
-U+00A1 Punctuation: Other punctuation, Common, Other
-U+00A2 Symbol: Currency symbol, Common, Other
-U+00A3 Symbol: Currency symbol, Common, Other
-U+00A4 Symbol: Currency symbol, Common, Other
-U+00A5 Symbol: Currency symbol, Common, Other
-U+00A6 Symbol: Other symbol, Common, Other
-U+00A7 Punctuation: Other punctuation, Common, Other
-U+00A8 Symbol: Modifier symbol, Common, Other
-U+00A9 Symbol: Other symbol, Common, Extended Pictographic
-U+00AA Letter: Other letter, Latin, Other
-U+00AB Punctuation: Initial punctuation, Common, Other
-U+00AC Symbol: Mathematical symbol, Common, Other
-U+00AD Control: Format, Common, Control
-U+00AE Symbol: Other symbol, Common, Extended Pictographic
-U+00AF Symbol: Modifier symbol, Common, Other
+U+00A0 CS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+00A1 ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A2 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A3 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A4 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A5 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A6 ON  Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A7 ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00A9 ON  Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00AA L   Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
+U+00AB ON  Punctuation: Initial punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
+U+00AC ON  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00AD BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+00AE ON  Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00AF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
 findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf 
-U+00B0 Symbol: Other symbol, Common, Other
-U+00B1 Symbol: Mathematical symbol, Common, Other
-U+00B2 Number: Other number, Common, Other
-U+00B3 Number: Other number, Common, Other
-U+00B4 Symbol: Modifier symbol, Common, Other
-U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C
-U+00B6 Punctuation: Other punctuation, Common, Other
-U+00B7 Punctuation: Other punctuation, Common, Other
-U+00B8 Symbol: Modifier symbol, Common, Other
-U+00B9 Number: Other number, Common, Other
-U+00BA Letter: Other letter, Latin, Other
-U+00BB Punctuation: Final punctuation, Common, Other
-U+00BC Number: Other number, Common, Other
-U+00BD Number: Other number, Common, Other
-U+00BE Number: Other number, Common, Other
-U+00BF Punctuation: Other punctuation, Common, Other
+U+00B0 ET  Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00B1 ET  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00B2 EN  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00B3 EN  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00B4 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B5 L   Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00B6 ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00B7 ON  Punctuation: Other punctuation, common, Other, [alphabetic, graphemebase, idcontinue, xidcontinue]
+U+00B8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B9 EN  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BA L   Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
+U+00BB ON  Punctuation: Final punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
+U+00BC ON  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BD ON  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BE ON  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BF ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
 findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf 
-U+00C0 Letter: Upper case letter, Latin, Other, U+00E0
-U+00C1 Letter: Upper case letter, Latin, Other, U+00E1
-U+00C2 Letter: Upper case letter, Latin, Other, U+00E2
-U+00C3 Letter: Upper case letter, Latin, Other, U+00E3
-U+00C4 Letter: Upper case letter, Latin, Other, U+00E4
-U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B
-U+00C6 Letter: Upper case letter, Latin, Other, U+00E6
-U+00C7 Letter: Upper case letter, Latin, Other, U+00E7
-U+00C8 Letter: Upper case letter, Latin, Other, U+00E8
-U+00C9 Letter: Upper case letter, Latin, Other, U+00E9
-U+00CA Letter: Upper case letter, Latin, Other, U+00EA
-U+00CB Letter: Upper case letter, Latin, Other, U+00EB
-U+00CC Letter: Upper case letter, Latin, Other, U+00EC
-U+00CD Letter: Upper case letter, Latin, Other, U+00ED
-U+00CE Letter: Upper case letter, Latin, Other, U+00EE
-U+00CF Letter: Upper case letter, Latin, Other, U+00EF
+U+00C0 L   Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C1 L   Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C2 L   Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C3 L   Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C4 L   Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C5 L   Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C6 L   Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C7 L   Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C8 L   Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C9 L   Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CA L   Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CB L   Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CC L   Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CD L   Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CE L   Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CF L   Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, graphemeextend, idcontinue, xidcontinue]
 findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df 
-U+00D0 Letter: Upper case letter, Latin, Other, U+00F0
-U+00D1 Letter: Upper case letter, Latin, Other, U+00F1
-U+00D2 Letter: Upper case letter, Latin, Other, U+00F2
-U+00D3 Letter: Upper case letter, Latin, Other, U+00F3
-U+00D4 Letter: Upper case letter, Latin, Other, U+00F4
-U+00D5 Letter: Upper case letter, Latin, Other, U+00F5
-U+00D6 Letter: Upper case letter, Latin, Other, U+00F6
-U+00D7 Symbol: Mathematical symbol, Common, Other
-U+00D8 Letter: Upper case letter, Latin, Other, U+00F8
-U+00D9 Letter: Upper case letter, Latin, Other, U+00F9
-U+00DA Letter: Upper case letter, Latin, Other, U+00FA
-U+00DB Letter: Upper case letter, Latin, Other, U+00FB
-U+00DC Letter: Upper case letter, Latin, Other, U+00FC
-U+00DD Letter: Upper case letter, Latin, Other, U+00FD
-U+00DE Letter: Upper case letter, Latin, Other, U+00FE
-U+00DF Letter: Lower case letter, Latin, Other, U+1E9E
+U+00D0 L   Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D1 L   Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D2 L   Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D3 L   Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D4 L   Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D5 L   Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D6 L   Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D7 ON  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00D8 L   Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D9 L   Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DA L   Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DB L   Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DC L   Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DD L   Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DE L   Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DF L   Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef 
-U+00E0 Letter: Lower case letter, Latin, Other, U+00C0
-U+00E1 Letter: Lower case letter, Latin, Other, U+00C1
-U+00E2 Letter: Lower case letter, Latin, Other, U+00C2
-U+00E3 Letter: Lower case letter, Latin, Other, U+00C3
-U+00E4 Letter: Lower case letter, Latin, Other, U+00C4
-U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B
-U+00E6 Letter: Lower case letter, Latin, Other, U+00C6
-U+00E7 Letter: Lower case letter, Latin, Other, U+00C7
-U+00E8 Letter: Lower case letter, Latin, Other, U+00C8
-U+00E9 Letter: Lower case letter, Latin, Other, U+00C9
-U+00EA Letter: Lower case letter, Latin, Other, U+00CA
-U+00EB Letter: Lower case letter, Latin, Other, U+00CB
-U+00EC Letter: Lower case letter, Latin, Other, U+00CC
-U+00ED Letter: Lower case letter, Latin, Other, U+00CD
-U+00EE Letter: Lower case letter, Latin, Other, U+00CE
-U+00EF Letter: Lower case letter, Latin, Other, U+00CF
+U+00E0 L   Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E1 L   Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E2 L   Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E3 L   Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E4 L   Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E5 L   Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E6 L   Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E7 L   Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E8 L   Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E9 L   Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EA L   Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EB L   Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EC L   Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00ED L   Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EE L   Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EF L   Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
 findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff 
-U+00F0 Letter: Lower case letter, Latin, Other, U+00D0
-U+00F1 Letter: Lower case letter, Latin, Other, U+00D1
-U+00F2 Letter: Lower case letter, Latin, Other, U+00D2
-U+00F3 Letter: Lower case letter, Latin, Other, U+00D3
-U+00F4 Letter: Lower case letter, Latin, Other, U+00D4
-U+00F5 Letter: Lower case letter, Latin, Other, U+00D5
-U+00F6 Letter: Lower case letter, Latin, Other, U+00D6
-U+00F7 Symbol: Mathematical symbol, Common, Other
-U+00F8 Letter: Lower case letter, Latin, Other, U+00D8
-U+00F9 Letter: Lower case letter, Latin, Other, U+00D9
-U+00FA Letter: Lower case letter, Latin, Other, U+00DA
-U+00FB Letter: Lower case letter, Latin, Other, U+00DB
-U+00FC Letter: Lower case letter, Latin, Other, U+00DC
-U+00FD Letter: Lower case letter, Latin, Other, U+00DD
-U+00FE Letter: Lower case letter, Latin, Other, U+00DE
-U+00FF Letter: Lower case letter, Latin, Other, U+0178
+U+00F0 L   Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F1 L   Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F2 L   Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F3 L   Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F4 L   Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F5 L   Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F6 L   Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F7 ON  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00F8 L   Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F9 L   Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FA L   Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FB L   Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FC L   Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FD L   Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FE L   Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FF L   Letter: Lower case letter, latin, Other, U+0178, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]

 findprop 0100 0101 0102 0103 0104 0105 0106
-U+0100 Letter: Upper case letter, Latin, Other, U+0101
-U+0101 Letter: Lower case letter, Latin, Other, U+0100
-U+0102 Letter: Upper case letter, Latin, Other, U+0103
-U+0103 Letter: Lower case letter, Latin, Other, U+0102
-U+0104 Letter: Upper case letter, Latin, Other, U+0105
-U+0105 Letter: Lower case letter, Latin, Other, U+0104
-U+0106 Letter: Upper case letter, Latin, Other, U+0107
+U+0100 L   Letter: Upper case letter, latin, Other, U+0101, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+0101 L   Letter: Lower case letter, latin, Other, U+0100, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+0102 L   Letter: Upper case letter, latin, Other, U+0103, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+0103 L   Letter: Lower case letter, latin, Other, U+0102, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+0104 L   Letter: Upper case letter, latin, Other, U+0105, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+0105 L   Letter: Lower case letter, latin, Other, U+0104, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+0106 L   Letter: Upper case letter, latin, Other, U+0107, [alphabetic, graphemeextend, idcontinue, xidcontinue]

 findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7 
-U+FFE0 Symbol: Currency symbol, Common, Other
-U+FFE1 Symbol: Currency symbol, Common, Other
-U+FFE2 Symbol: Mathematical symbol, Common, Other
-U+FFE3 Symbol: Modifier symbol, Common, Other
-U+FFE4 Symbol: Other symbol, Common, Other
-U+FFE5 Symbol: Currency symbol, Common, Other
-U+FFE6 Symbol: Currency symbol, Common, Other
-U+FFE7 Control: Unassigned, Unknown, Other
+U+FFE0 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE1 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE2 ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFE3 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+FFE4 ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE5 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE6 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE7 L   Control: Unassigned, unknown, Other
 findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
-U+FFE8 Symbol: Other symbol, Common, Other
-U+FFE9 Symbol: Mathematical symbol, Common, Other
-U+FFEA Symbol: Mathematical symbol, Common, Other
-U+FFEB Symbol: Mathematical symbol, Common, Other
-U+FFEC Symbol: Mathematical symbol, Common, Other
-U+FFED Symbol: Other symbol, Common, Other
-U+FFEE Symbol: Other symbol, Common, Other
-U+FFEF Control: Unassigned, Unknown, Other
+U+FFE8 ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE9 ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFEA ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFEB ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFEC ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFED ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFEE ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFEF L   Control: Unassigned, unknown, Other
 findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
-U+FFF8 Control: Unassigned, Unknown, Control
-U+FFF9 Control: Format, Common, Control
-U+FFFA Control: Format, Common, Control
-U+FFFB Control: Format, Common, Control
-U+FFFC Symbol: Other symbol, Common, Other
-U+FFFD Symbol: Other symbol, Common, Other
-U+FFFE Control: Unassigned, Unknown, Other
-U+FFFF Control: Unassigned, Unknown, Other
+U+FFF8 BN  Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
+U+FFF9 ON  Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
+U+FFFA ON  Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
+U+FFFB ON  Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
+U+FFFC ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFFD ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFFE BN  Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FFFF BN  Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
 findprop 10000 10001 e01ef f0000 100000
-U+10000 Letter: Other letter, Linear_B, Other
-U+10001 Letter: Other letter, Linear_B, Other
-U+E01EF Mark: Non-spacing mark, Inherited, Extend
-U+F0000 Control: Private use, Unknown, Other
-U+100000 Control: Private use, Unknown, Other
+U+10000 L   Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10001 L   Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
+U+F0000 L   Control: Private use, unknown, Other
+U+100000 L   Control: Private use, unknown, Other

 findprop 1b00 12000 7c0 a840 10900
-U+1B00 Mark: Non-spacing mark, Balinese, Extend
-U+12000 Letter: Other letter, Cuneiform, Other
-U+07C0 Number: Decimal number, Nko, Other
-U+A840 Letter: Other letter, Phags_Pa, Other
-U+10900 Letter: Other letter, Phoenician, Other
+U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
+U+12000 L   Letter: Other letter, cuneiform, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+07C0 R   Number: Decimal number, nko, Other, [graphemebase, patternsyntax, terminalpunctuation]
+U+A840 L   Letter: Other letter, phagspa, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10900 R   Letter: Other letter, phoenician, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 findprop 1d79 a77d
-U+1D79 Letter: Lower case letter, Latin, Other, U+A77D
-U+A77D Letter: Upper case letter, Latin, Other, U+1D79
+U+1D79 L   Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+A77D L   Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, graphemeextend, idcontinue, xidcontinue]

 findprop  0800  083e  a4d0  a4f7  aa80  aadf
-U+0800 Letter: Other letter, Samaritan, Other
-U+083E Punctuation: Other punctuation, Samaritan, Other
-U+A4D0 Letter: Other letter, Lisu, Other
-U+A4F7 Letter: Other letter, Lisu, Other
-U+AA80 Letter: Other letter, Tai_Viet, Other
-U+AADF Punctuation: Other punctuation, Tai_Viet, Other
+U+0800 R   Letter: Other letter, samaritan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+083E R   Punctuation: Other punctuation, samaritan, Other, [bidimirrored, graphemebase, math, patternsyntax]
+U+A4D0 L   Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+A4F7 L   Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AA80 L   Letter: Other letter, taiviet, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AADF L   Punctuation: Other punctuation, taiviet, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
 findprop 10b00 10b35 13000 1342e 10840 10855
-U+10B00 Letter: Other letter, Avestan, Other
-U+10B35 Letter: Other letter, Avestan, Other
-U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
-U+1342E Letter: Other letter, Egyptian_Hieroglyphs, Other
-U+10840 Letter: Other letter, Imperial_Aramaic, Other
-U+10855 Letter: Other letter, Imperial_Aramaic, Other
+U+10B00 R   Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10B35 R   Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+13000 L   Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1342E L   Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10840 R   Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10855 R   Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]

 findprop 11100 1113c 11680 116c0
-U+11100 Mark: Non-spacing mark, Chakma, Extend
-U+1113C Number: Decimal number, Chakma, Other
-U+11680 Letter: Other letter, Takri, Other
-U+116C0 Number: Decimal number, Takri, Other
+U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
+U+1113C L   Number: Decimal number, chakma, Other, [graphemebase, patternsyntax, terminalpunctuation]
+U+11680 L   Letter: Other letter, takri, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+116C0 L   Number: Decimal number, takri, Other, [graphemebase, patternsyntax, terminalpunctuation]

 findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
-U+000D Control: Control, Common, CR
-U+000A Control: Control, Common, LF
-U+000E Control: Control, Common, Control
-U+0711 Mark: Non-spacing mark, Syriac, Extend
-U+1B04 Mark: Spacing mark, Balinese, SpacingMark
-U+1111 Letter: Other letter, Hangul, Hangul syllable type L
-U+1169 Letter: Other letter, Hangul, Hangul syllable type V
-U+11FE Letter: Other letter, Hangul, Hangul syllable type T
-U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV
-U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+000D B   Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000A B   Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000E BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
+U+1B04 L   Mark: Spacing mark, balinese, SpacingMark, [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
+U+1111 L   Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1169 L   Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+11FE L   Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE4C L   Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD89 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]

 findprop 118a0 11ac7 16ad0
-U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0
-U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other
-U+16AD0 Letter: Other letter, Bassa_Vah, Other
+U+118A0 L   Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+11AC7 L   Letter: Other letter, paucinhau, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+16AD0 L   Letter: Other letter, bassavah, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]

 findprop 11700 14400 108e0 11280 1d800
-U+11700 Letter: Other letter, Ahom, Other
-U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
-U+108E0 Letter: Other letter, Hatran, Other
-U+11280 Letter: Other letter, Multani, Other
-U+1D800 Symbol: Other symbol, SignWriting, Other
+U+11700 L   Letter: Other letter, ahom, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+14400 L   Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+108E0 R   Letter: Other letter, hatran, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+11280 L   Letter: Other letter, multani, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1D800 L   Symbol: Other symbol, signwriting, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]

 findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
-U+11800 Letter: Other letter, Dogra, Other
-U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925
-U+11DA9 Number: Decimal number, Gunjala_Gondi, Other
-U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
-U+11EE0 Letter: Other letter, Makasar, Other
-U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68
-U+10F27 Letter: Other letter, Old_Sogdian, Other
-U+10F30 Letter: Other letter, Sogdian, Other
+U+11800 L   Letter: Other letter, dogra, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1E903 R   Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+11DA9 L   Number: Decimal number, gunjalagondi, Other, [graphemebase, patternsyntax, terminalpunctuation]
+U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [extendedpictographic, graphemebase, patternsyntax]
+U+11EE0 L   Letter: Other letter, makasar, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+16E48 L   Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+10F27 R   Letter: Other letter, oldsogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10F30 AL  Letter: Other letter, sogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]

 findprop  a836  a833  1cf4  20f0  1cd0
-U+A836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
-U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
-U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
-U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
-U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
+U+A836 L   Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+A833 L   Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemebase, patternsyntax, quotationmark]
+U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]

 findprop 32ff
-U+32FF Symbol: Other symbol, Common, Other, [Han]
+U+32FF L   Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]

 findprop 1f16d
-U+1F16D Symbol: Other symbol, Common, Extended Pictographic
+U+1F16D ON  Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]

 findprop U+10e93 U+10eaa
-U+10E93 Letter: Other letter, Yezidi, Other
-U+10EAA Control: Unassigned, Unknown, Other
+U+10E93 R   Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10EAA R   Control: Unassigned, unknown, Other
+
+findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
+U+0602 AN  Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
+U+202A LRE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202B RLE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202D LRO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
--- a/maint/ucptestdata/testoutput2
+++ b/maint/ucptestdata/testoutput2
@ -1,196 +1,298 @@
 find script Han
-U+2E80..U+2E99 Symbol: Other symbol, Han, Other
-U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other
-U+2F00..U+2FD5 Symbol: Other symbol, Han, Other
-        U+3005 Letter: Modifier letter, Han, Other
-        U+3007 Number: Letter number, Han, Other
-U+3021..U+3029 Number: Letter number, Han, Other
-U+3038..U+303A Number: Letter number, Han, Other
-        U+303B Letter: Modifier letter, Han, Other
-U+3400..U+4DBF Letter: Other letter, Han, Other
-U+4E00..U+9FFF Letter: Other letter, Han, Other
-U+F900..U+FA6D Letter: Other letter, Han, Other
-U+FA70..U+FAD9 Letter: Other letter, Han, Other
-        U+16FE2 Punctuation: Other punctuation, Han, Other
-       U+16FE3 Letter: Modifier letter, Han, Other
-U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark
-U+20000..U+2A6DF Letter: Other letter, Han, Other
-U+2A700..U+2B738 Letter: Other letter, Han, Other
-U+2B740..U+2B81D Letter: Other letter, Han, Other
-U+2B820..U+2CEA1 Letter: Other letter, Han, Other
-U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other
-U+2F800..U+2FA1D Letter: Other letter, Han, Other
-U+30000..U+3134A Letter: Other letter, Han, Other
+U+2E80..U+2E99 ON  Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
+U+2E9B..U+2EF3 ON  Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
+U+2F00..U+2FD5 ON  Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
+        U+3005 L   Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+3007 L   Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+3021..U+3029 L   Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+3038..U+303A L   Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+303B L   Letter: Modifier letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
+U+3400..U+4DBF L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+4E00..U+9FFF L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+F900..U+FA0D L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA0E..U+FA0F L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA10 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+FA11 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA12 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA13..U+FA14 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FA15..U+FA1E L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+FA1F L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA20 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+FA21 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA22 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA23..U+FA24 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FA25..U+FA26 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA27..U+FA29 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FA2A..U+FA6D L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA70..U+FAD9 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+16FE2 ON  Punctuation: Other punctuation, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+       U+16FE3 L   Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+16FF0..U+16FF1 L   Mark: Spacing mark, han, SpacingMark, [caseignorable, graphemeextend, idcontinue, ideographic, xidcontinue]
+U+20000..U+2A6DF L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2A700..U+2B738 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2B740..U+2B81D L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2B820..U+2CEA1 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2CEB0..U+2EBE0 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2F800..U+2FA1D L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+30000..U+3134A L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
 find type Pe script Common scriptx Hangul
-U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
-U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
-        U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+3009 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+300B ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+300D ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
+U+300F ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
+U+3011 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+3015 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+3017 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+3019 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+301B ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+301E..U+301F ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [softdotted, terminalpunctuation, unifiedideograph, xidcontinue, xidstart]
+        U+FF63 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, emojimodifier, emojimodifierbase]
 find type Sk
-U+005E Symbol: Modifier symbol, Common, Other
-U+0060 Symbol: Modifier symbol, Common, Other
-U+00A8 Symbol: Modifier symbol, Common, Other
-U+00AF Symbol: Modifier symbol, Common, Other
-U+00B4 Symbol: Modifier symbol, Common, Other
-U+00B8 Symbol: Modifier symbol, Common, Other
-U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other
-U+02D2..U+02DF Symbol: Modifier symbol, Common, Other
-U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other
-U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other
-        U+02ED Symbol: Modifier symbol, Common, Other
-U+02EF..U+02FF Symbol: Modifier symbol, Common, Other
-        U+0375 Symbol: Modifier symbol, Greek, Other
-        U+0384 Symbol: Modifier symbol, Greek, Other
-        U+0385 Symbol: Modifier symbol, Common, Other
-        U+0888 Symbol: Modifier symbol, Arabic, Other
-        U+1FBD Symbol: Modifier symbol, Greek, Other
-U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other
-U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other
-U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other
-U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other
-U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other
-U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
-U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin]
-U+A708..U+A716 Symbol: Modifier symbol, Common, Other
-U+A720..U+A721 Symbol: Modifier symbol, Common, Other
-U+A789..U+A78A Symbol: Modifier symbol, Common, Other
-        U+AB5B Symbol: Modifier symbol, Common, Other
-U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other
-U+FBB2..U+FBC2 Symbol: Modifier symbol, Arabic, Other
-        U+FF3E Symbol: Modifier symbol, Common, Other
-        U+FF40 Symbol: Modifier symbol, Common, Other
-        U+FFE3 Symbol: Modifier symbol, Common, Other
-U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend
+U+005E ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0060 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+00A8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00AF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B4 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02C2..U+02C5 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02D2..U+02DF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02E5..U+02E9 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02EA..U+02EB ON  Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+02ED ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02EF..U+02FF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0375 ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0384 ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0385 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0888 AL  Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
+        U+1FBD ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FBF..U+1FC1 ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FCD..U+1FCF ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FDD..U+1FDF ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FED..U+1FEF ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FFD..U+1FFE ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+309B..U+309C ON  Symbol: Modifier symbol, common, Other, [hiragana, katakana], [alphabetic, bidimirrored, caseignorable, cased, changeswhencasefolded, changeswhenlowercased, changeswhentitlecased, changeswhenuppercased, dash, defaultignorablecodepoint, deprecated, diacritic, emoji, emojicomponent, emojimodifier, emojimodifierbase, emojipresentation, extendedpictographic, extender, graphemebase, graphemeextend, graphemelink, hexdigit, idsbinaryoperator, idstrinaryoperator, idcontinue, idstart, ideographic, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
+U+A700..U+A707 ON  Symbol: Modifier symbol, common, Other, [latin, han], [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+A708..U+A716 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+A720..U+A721 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+A789..U+A78A L   Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+AB5B L   Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+AB6A..U+AB6B ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+FBB2..U+FBC2 AL  Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
+        U+FF3E ON  Symbol: Modifier symbol, common, Other, [asciihexdigit, bidicontrol, bidimirrored, cased, changeswhencasefolded, sentenceterminal, unifiedideograph, whitespace, xidstart]
+        U+FF40 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+FFE3 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1F3FB..U+1F3FF ON  Symbol: Modifier symbol, common, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternsyntax, radical, sentenceterminal, terminalpunctuation]
 find type Pd
-U+002D Punctuation: Dash punctuation, Common, Other
-U+058A Punctuation: Dash punctuation, Armenian, Other
-U+05BE Punctuation: Dash punctuation, Hebrew, Other
-U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other
-U+1806 Punctuation: Dash punctuation, Mongolian, Other
-U+2010..U+2015 Punctuation: Dash punctuation, Common, Other
-        U+2E17 Punctuation: Dash punctuation, Common, Other
-        U+2E1A Punctuation: Dash punctuation, Common, Other
-U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other
-        U+2E40 Punctuation: Dash punctuation, Common, Other
-        U+2E5D Punctuation: Dash punctuation, Common, Other
-        U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
-        U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
-        U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
-U+FE31..U+FE32 Punctuation: Dash punctuation, Common, Other
-        U+FE58 Punctuation: Dash punctuation, Common, Other
-        U+FE63 Punctuation: Dash punctuation, Common, Other
-        U+FF0D Punctuation: Dash punctuation, Common, Other
-        U+10EAD Punctuation: Dash punctuation, Yezidi, Other
+U+002D ES  Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+058A ON  Punctuation: Dash punctuation, armenian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+05BE R   Punctuation: Dash punctuation, hebrew, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+1400 ON  Punctuation: Dash punctuation, canadianaboriginal, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+1806 ON  Punctuation: Dash punctuation, mongolian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+2010..U+2015 ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E17 ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E1A ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+U+2E3A..U+2E3B ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E40 ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E5D ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+301C ON  Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+        U+30A0 ON  Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+FE31..U+FE32 ON  Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+FE58 ON  Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+FE63 ES  Punctuation: Dash punctuation, common, Other, [caseignorable, sentenceterminal, unifiedideograph, xidcontinue]
+        U+FF0D ES  Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+10EAD R   Punctuation: Dash punctuation, yezidi, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
 find gbreak LVT
-U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AEA1..U+AEBB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B1B1..U+B1CB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B4C1..U+B4DB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT
-U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AC01..U+AC1B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC1D..U+AC37 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC39..U+AC53 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC55..U+AC6F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC71..U+AC8B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC8D..U+ACA7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACA9..U+ACC3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACC5..U+ACDF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACE1..U+ACFB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACFD..U+AD17 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD19..U+AD33 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD35..U+AD4F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD51..U+AD6B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD6D..U+AD87 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD89..U+ADA3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADA5..U+ADBF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADC1..U+ADDB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADDD..U+ADF7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADF9..U+AE13 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE15..U+AE2F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE31..U+AE4B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE4D..U+AE67 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE69..U+AE83 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE85..U+AE9F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AEA1..U+AEBB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AEBD..U+AED7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AED9..U+AEF3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AEF5..U+AF0F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF11..U+AF2B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF2D..U+AF47 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF49..U+AF63 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF65..U+AF7F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF81..U+AF9B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF9D..U+AFB7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AFB9..U+AFD3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AFD5..U+AFEF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AFF1..U+B00B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B00D..U+B027 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B029..U+B043 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B045..U+B05F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B061..U+B07B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B07D..U+B097 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B099..U+B0B3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B0B5..U+B0CF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B0D1..U+B0EB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B0ED..U+B107 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B109..U+B123 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B125..U+B13F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B141..U+B15B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B15D..U+B177 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B179..U+B193 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B195..U+B1AF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B1B1..U+B1CB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B1CD..U+B1E7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B1E9..U+B203 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B205..U+B21F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B221..U+B23B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B23D..U+B257 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B259..U+B273 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B275..U+B28F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B291..U+B2AB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B2AD..U+B2C7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B2C9..U+B2E3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B2E5..U+B2FF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B301..U+B31B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B31D..U+B337 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B339..U+B353 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B355..U+B36F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B371..U+B38B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B38D..U+B3A7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3A9..U+B3C3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3C5..U+B3DF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3E1..U+B3FB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3FD..U+B417 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B419..U+B433 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B435..U+B44F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B451..U+B46B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B46D..U+B487 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B489..U+B4A3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4A5..U+B4BF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4C1..U+B4DB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4DD..U+B4F7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4F9..U+B513 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B515..U+B52F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B531..U+B54B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B54D..U+B567 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B569..U+B583 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B585..U+B59F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5A1..U+B5BB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5BD..U+B5D7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5D9..U+B5F3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5F5..U+B60F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B611..U+B62B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B62D..U+B647 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B649..U+B663 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B665..U+B67F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B681..U+B69B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B69D..U+B6B7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B6B9..U+B6D3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B6D5..U+B6EF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 ...
 find script Old_Uyghur
-U+10F70..U+10F81 Letter: Other letter, Old_Uyghur, Other
-U+10F82..U+10F85 Mark: Non-spacing mark, Old_Uyghur, Extend
-U+10F86..U+10F89 Punctuation: Other punctuation, Old_Uyghur, Other
+U+10F70..U+10F81 R   Letter: Other letter, olduyghur, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+10F86..U+10F89 R   Punctuation: Other punctuation, olduyghur, Other, [bidimirrored, graphemebase, math, patternsyntax]
+find bidi PDF
+U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+find bidi CS
+U+002C CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+002E CS  Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
+U+002F CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+003A CS  Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+00A0 CS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+060C CS  Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+202F CS  Separator: Space separator, common, Other, [latin, mongolian], [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+2044 CS  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+FE50 CS  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+FE52 CS  Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FE55 CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FF0C CS  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+FF0E CS  Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FF0F CS  Punctuation: Other punctuation, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FF1A CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+find bidi CS type Sm
+U+2044 CS  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+find bidi B
+U+000A B   Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000D B   Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+001C..U+001E B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+        U+0085 B   Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
+        U+2029 B   Separator: Paragraph separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
+find bidi FSI
+U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+find bidi PDI
+U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+find bidi RLI
+U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+find bidi RLO
+U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+find bidi S
+U+0009 S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000B S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+001F S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+find bidi WS
+U+000C WS  Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+0020 WS  Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
+U+1680 WS  Separator: Space separator, ogham, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+2000..U+200A WS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+        U+2028 WS  Separator: Line separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
+        U+205F WS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+        U+3000 WS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+find script bopo
+U+02EA..U+02EB ON  Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+3105..U+312F L   Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+31A0..U+31BF L   Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+find bool prependedconcatenationmark
+U+00AD BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+180E BN  Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
+U+200B BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2060 BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2118 ON  Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+AAC0 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+AAC2 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
+U+FE55 CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FEFF BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+FF1A CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FF21..U+FF26 L   Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+10D22..U+10D23 AL  Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+       U+1135D L   Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+1BCA0..U+1BCA3 BN  Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
+U+1D173..U+1D17A BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+1F1E6..U+1F1FF L   Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+find bool pcm
+U+00AD BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+180E BN  Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
+U+200B BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2060 BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2118 ON  Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+AAC0 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+AAC2 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
+U+FE55 CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FEFF BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+FF1A CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FF21..U+FF26 L   Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+10D22..U+10D23 AL  Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+       U+1135D L   Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+1BCA0..U+1BCA3 BN  Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
+U+1D173..U+1D17A BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+1F1E6..U+1F1FF L   Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
--- a/src/config.h.generic
+++ b/src/config.h.generic
@ -97,6 +97,9 @@ sure both macros are undefined; an emulation function will then be used. */
 /* Have PTHREAD_PRIO_INHERIT. */
 /* #undef HAVE_PTHREAD_PRIO_INHERIT */

+/* Define to 1 if you have the <readline.h> header file. */
+/* #undef HAVE_READLINE_H */
+
 /* Define to 1 if you have the <readline/history.h> header file. */
 /* #undef HAVE_READLINE_HISTORY_H */

@ -233,7 +236,7 @@ sure both macros are undefined; an emulation function will then be used. */
 #define PACKAGE_NAME "PCRE2"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "PCRE2 10.39"
+#define PACKAGE_STRING "PCRE2 10.40"

 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "pcre2"
@ -242,7 +245,7 @@ sure both macros are undefined; an emulation function will then be used. */
 #define PACKAGE_URL ""

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "10.39"
+#define PACKAGE_VERSION "10.40"

 /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
   parentheses (of any kind) in a pattern. This limits the amount of system
@ -435,7 +438,7 @@ sure both macros are undefined; an emulation function will then be used. */
 #endif

 /* Version number of package */
-#define VERSION "10.39"
+#define VERSION "10.40"

 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
--- a/src/config.h.in
+++ b/src/config.h.in
@ -97,6 +97,9 @@ sure both macros are undefined; an emulation function will then be used. */
 /* Have PTHREAD_PRIO_INHERIT. */
 #undef HAVE_PTHREAD_PRIO_INHERIT

+/* Define to 1 if you have the <readline.h> header file. */
+#undef HAVE_READLINE_H
+
 /* Define to 1 if you have the <readline/history.h> header file. */
 #undef HAVE_READLINE_HISTORY_H

--- a/src/pcre2.h.generic
+++ b/src/pcre2.h.generic
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
 /* The current PCRE version information. */

 #define PCRE2_MAJOR           10
-#define PCRE2_MINOR           39
+#define PCRE2_MINOR           40
 #define PCRE2_PRERELEASE      
-#define PCRE2_DATE            2021-10-29
+#define PCRE2_DATE            2022-04-14

 /* When an application links to a PCRE DLL in Windows, the symbols that are
 imported have to be identified as such. When building PCRE2, the appropriate
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2021 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -123,18 +123,21 @@ opcode is used to select the column. The values are as follows:
 */

 static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
-/* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
-  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
-  { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
-  { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
-  { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
-  { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
-  { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
-  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
-  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
-  { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
-  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
-  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
+/* ANY LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
+  { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_ANY */
+  { 0,  3,  0,  0,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_LAMP */
+  { 0,  0,  2,  4,  0,   0,    9,   10,     10,  11,    0,   0,    0,    0 },  /* PT_GC */
+  { 0,  0,  5,  2,  0,   0,   15,   16,     16,  17,    0,   0,    0,    0 },  /* PT_PC */
+  { 0,  0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SC */
+  { 0,  0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SCX */
+  { 0,  3,  6, 12,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_ALNUM */
+  { 0,  1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_SPACE */
+  { 0,  1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_PXSPACE */
+  { 0,  0,  8, 14,  0,   0,    0,    1,      1,   3,    0,   0,    0,    0 },  /* PT_WORD */
+  { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_CLIST */
+  { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   3,    0,    0 },  /* PT_UCNC */
+  { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_BIDICL */
+  { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 }   /* PT_BOOL */
 };

 /* This table is used to check whether auto-possessification is possible
@ -196,6 +199,7 @@ static BOOL
 check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
 {
+BOOL ok;
 const uint32_t *p;
 const ucd_record *prop = GET_UCD(c);

@ -215,6 +219,11 @@ switch(ptype)
  case PT_SC:
  return (pdata == prop->script) == negated;

+  case PT_SCX:
+  ok = (pdata == prop->script
+        || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
+  return ok == negated;
+
  /* These are specials */

  case PT_ALNUM:
@ -251,6 +260,14 @@ switch(ptype)
    if (c == *p++) return negated;
    }
  break;  /* Control never reaches here */
+
+  /* Not yet thought through: report FALSE, as for any unhandled type. */
+
+  case PT_BIDICL:
+  return FALSE;
+
+  case PT_BOOL:
+  return FALSE;
  }

 return FALSE;
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2021 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -124,7 +124,7 @@ static unsigned int

 static int
  compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
-    uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
+    uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
    compile_block *, PCRE2_SIZE *);

 static int
@ -385,13 +385,15 @@ compiler is clever with identical subexpressions. */

 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))

-/* Private flags added to firstcu and reqcu. */
+/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
+variables, which are concerned with first and required code units. A value
+greater than or equal to REQ_NONE means "no code unit set"; otherwise the
+matching xxcu variable is set, and the low valued bits are relevant. */

-#define REQ_CASELESS    (1u << 0)       /* Indicates caselessness */
-#define REQ_VARY        (1u << 1)       /* reqcu followed non-literal item */
-/* Negative values for the firstcu and reqcu flags */
-#define REQ_UNSET       (-2)            /* Not yet found anything */
-#define REQ_NONE        (-1)            /* Found not fixed char */
+#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
+#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
+#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
+#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */

 /* These flags are used in the groupinfo vector. */

@ -1264,8 +1266,10 @@ PCRE2_SIZE* ref_count;

 if (code != NULL)
  {
+#ifdef SUPPORT_JIT   
  if (code->executable_jit != NULL)
    PRIV(jit_free)(code->executable_jit, &code->memctl);
+#endif

  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
    {
@ -2088,7 +2092,9 @@ get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
 PCRE2_UCHAR c;
 PCRE2_SIZE i, bot, top;
 PCRE2_SPTR ptr = *ptrptr;
-PCRE2_UCHAR name[32];
+PCRE2_UCHAR name[50];
+PCRE2_UCHAR *vptr = NULL;
+uint16_t ptscript = PT_NOTSCRIPT;

 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
 c = *ptr++;
@ -2100,36 +2106,95 @@ negation. */
 if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
+
  if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }
+
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
    {
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
    c = *ptr++;
+    while (c == '_' || c == '-' || isspace(c))
+      {
+      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
+      c = *ptr++;
+      }
    if (c == CHAR_NUL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
-    name[i] = c;
+    name[i] = tolower(c);
+    if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
    }
+
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[i] = 0;
  }

-/* Otherwise there is just one following character, which must be an ASCII
-letter. */
+/* If { doesn't follow \p or \P there is just one following character, which
+must be an ASCII letter. */

 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
  {
-  name[0] = c;
+  name[0] = tolower(c);
  name[1] = 0;
  }
 else goto ERROR_RETURN;

 *ptrptr = ptr;

-/* Search for a recognized property name using binary chop. */
+/* If the property contains ':' or '=' we have class name and value separately
+specified. The following are supported:
+
+  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
+  . Script (synonym sc) for which the property name is the script name
+  . Script_Extensions (synonym scx), ditto
+
+As this is a small number, we currently just check the names directly. If this
+grows, a sorted table and a switch will be neater.
+
+For both the script properties, set a PT_xxx value so that (1) they can be
+distinguished and (2) invalid script names that happen to be the name of
+another property can be diagnosed. */
+
+if (vptr != NULL)
+  {
+  int offset = 0;
+  PCRE2_UCHAR sname[8];
+
+  *vptr = 0;   /* Terminate property name */
+  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
+      PRIV(strcmp_c8)(name, STRING_bc) == 0)
+    {
+    offset = 4;
+    sname[0] = CHAR_b;
+    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
+    sname[2] = CHAR_d;
+    sname[3] = CHAR_i;
+    }
+
+  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
+           PRIV(strcmp_c8)(name, STRING_sc) == 0)
+    ptscript = PT_SC;
+
+  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
+           PRIV(strcmp_c8)(name, STRING_scx) == 0)
+    ptscript = PT_SCX;
+
+  else
+    {
+    *errorcodeptr = ERR47;
+    return FALSE;
+    }
+
+  /* Adjust the string in name[] as needed */
+
+  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
+  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
+  }
+
+/* Search for a recognized property using binary chop. */

 bot = 0;
 top = PRIV(utt_size);
@ -2139,15 +2204,37 @@ while (bot < top)
  int r;
  i = (bot + top) >> 1;
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
+
+  /* When a matching property is found, some extra checking is needed when the
+  \p{xx:yy} syntax is used and xx is either sc or scx. */
+
  if (r == 0)
    {
-    *ptypeptr = PRIV(utt)[i].type;
    *pdataptr = PRIV(utt)[i].value;
-    return TRUE;
+    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
+      {
+      *ptypeptr = PRIV(utt)[i].type;
+      return TRUE;
+      }
+
+    switch (PRIV(utt)[i].type)
+      {
+      case PT_SC:
+      *ptypeptr = PT_SC;
+      return TRUE;
+
+      case PT_SCX:
+      *ptypeptr = ptscript;
+      return TRUE;
+      }
+
+    break;  /* Non-script found */
    }
+
  if (r > 0) bot = i + 1; else top = i;
  }
-*errorcodeptr = ERR47;   /* Unrecognized name */
+
+*errorcodeptr = ERR47;   /* Unrecognized property */
 return FALSE;

 ERROR_RETURN:            /* Malformed \P or \p */
@ -5285,9 +5372,9 @@ Arguments:
  pptrptr           points to the current parsed pattern pointer
  errorcodeptr      points to error code variable
  firstcuptr        place to put the first required code unit
-  firstcuflagsptr   place to put the first code unit flags, or a negative number
+  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
-  reqcuflagsptr     place to put the last required code unit flags, or a negative number
+  reqcuflagsptr     place to put the last required code unit flags
  bcptr             points to current branch chain
  cb                contains pointers to tables etc.
  lengthptr         NULL during the real compile phase
@ -5300,8 +5387,8 @@ Returns:            0 There's been an error, *errorcodeptr is non-zero

 static int
 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
-  int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
-  uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
+  int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr,
+  uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr,
  compile_block *cb, PCRE2_SIZE *lengthptr)
 {
 int bravalue = 0;
@ -5316,9 +5403,9 @@ uint32_t zeroreqcu, zerofirstcu;
 uint32_t escape;
 uint32_t *pptr = *pptrptr;
 uint32_t meta, meta_arg;
-int32_t firstcuflags, reqcuflags;
-int32_t zeroreqcuflags, zerofirstcuflags;
-int32_t req_caseopt, reqvary, tempreqvary;
+uint32_t firstcuflags, reqcuflags;
+uint32_t zeroreqcuflags, zerofirstcuflags;
+uint32_t req_caseopt, reqvary, tempreqvary;
 PCRE2_SIZE offset = 0;
 PCRE2_SIZE length_prevgroup = 0;
 PCRE2_UCHAR *code = *codeptr;
@ -5374,13 +5461,13 @@ item types that can be repeated set these backoff variables appropriately. */
 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;

-/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
+/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
 according to the current setting of the caseless flag. The REQ_CASELESS value
 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
 to record the case status of the value. This is used only for ASCII characters.
 */

-req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
+req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;

 /* Switch on next META item until the end of the branch */

@ -5395,13 +5482,12 @@ for (;; pptr++)
  BOOL possessive_quantifier;
  BOOL note_group_empty;
  int class_has_8bitchar;
-  int i;
  uint32_t mclength;
  uint32_t skipunits;
  uint32_t subreqcu, subfirstcu;
  uint32_t groupnumber;
  uint32_t verbarglen, verbculen;
-  int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
+  uint32_t subreqcuflags, subfirstcuflags;
  open_capitem *oc;
  PCRE2_UCHAR mcbuffer[8];

@ -5770,9 +5856,9 @@ for (;; pptr++)
        if (taboffset >= 0)
          {
          if (tabopt >= 0)
-            for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
+            for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
          else
-            for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
+            for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
          }

        /* Now see if we need to remove any special characters. An option
@ -5786,9 +5872,9 @@ for (;; pptr++)
        being built and we are done. */

        if (local_negate)
-          for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
+          for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
        else
-          for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
+          for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];

        /* Every class contains at least one < 256 character. */

@ -5827,21 +5913,23 @@ for (;; pptr++)
        switch(escape)
          {
          case ESC_d:
-          for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
+          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
          break;

          case ESC_D:
          should_flip_negation = TRUE;
-          for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
+          for (int i = 0; i < 32; i++)
+            classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
          break;

          case ESC_w:
-          for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
+          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
          break;

          case ESC_W:
          should_flip_negation = TRUE;
-          for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
+          for (int i = 0; i < 32; i++)
+            classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
          break;

          /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
@ -5852,12 +5940,13 @@ for (;; pptr++)
          longer treat \s and \S specially. */

          case ESC_s:
-          for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
+          for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
          break;

          case ESC_S:
          should_flip_negation = TRUE;
-          for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
+          for (int i = 0; i < 32; i++)
+            classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
          break;

          /* When adding the horizontal or vertical space lists to a class, or
@ -6098,7 +6187,7 @@ for (;; pptr++)
        if (negate_class && !xclass_has_prop)
          {
          /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
-          for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
+          for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
          }
        memcpy(code, classbits, 32);
        code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
@ -6124,7 +6213,7 @@ for (;; pptr++)
      if (negate_class)
        {
       /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
-       for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
+       for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
       }
      memcpy(code, classbits, 32);
      }
@ -6198,7 +6287,7 @@ for (;; pptr++)
    verbarglen = *(++pptr);
    verbculen = 0;
    tempcode = code++;
-    for (i = 0; i < (int)verbarglen; i++)
+    for (int i = 0; i < (int)verbarglen; i++)
      {
      meta = *(++pptr);
 #ifdef SUPPORT_UNICODE
@ -6247,6 +6336,7 @@ for (;; pptr++)
    bravalue = OP_COND;
      {
      int count, index;
+      unsigned int i;
      PCRE2_SPTR name;
      named_group *ng = cb->named_groups;
      uint32_t length = *(++pptr);
@ -6286,7 +6376,7 @@ for (;; pptr++)
        groupnumber = 0;
        if (meta == META_COND_RNUMBER)
          {
-          for (i = 1; i < (int)length; i++)
+          for (i = 1; i < length; i++)
            {
            groupnumber = groupnumber * 10 + name[i] - CHAR_0;
            if (groupnumber > MAX_GROUP_NUMBER)
@ -6608,7 +6698,7 @@ for (;; pptr++)

      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
        {
-        if (subfirstcuflags >= 0)
+        if (subfirstcuflags < REQ_NONE)
          {
          firstcu = subfirstcu;
          firstcuflags = subfirstcuflags;
@ -6622,7 +6712,7 @@ for (;; pptr++)
      into reqcu if there wasn't one, using the vary flag that was in
      existence beforehand. */

-      else if (subfirstcuflags >= 0 && subreqcuflags < 0)
+      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
        {
        subreqcu = subfirstcu;
        subreqcuflags = subfirstcuflags | tempreqvary;
@ -6631,7 +6721,7 @@ for (;; pptr++)
      /* If the subpattern set a required code unit (or set a first code unit
      that isn't really the first code unit - see above), set it. */

-      if (subreqcuflags >= 0)
+      if (subreqcuflags < REQ_NONE)
        {
        reqcu = subreqcu;
        reqcuflags = subreqcuflags;
@ -6650,7 +6740,7 @@ for (;; pptr++)
    in that example, 'X' ends up set for both. */

    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
-             subreqcuflags >= 0 && subfirstcuflags >= 0)
+             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
      {
      reqcu = subreqcu;
      reqcuflags = subreqcuflags;
@ -6680,7 +6770,7 @@ for (;; pptr++)
      this name is duplicated. */

      groupnumber = 0;
-      for (i = 0; i < cb->names_found; i++, ng++)
+      for (unsigned int i = 0; i < cb->names_found; i++, ng++)
        {
        if (length == ng->length &&
            PRIV(strncmp)(name, ng->name, length) == 0)
@ -6935,14 +7025,19 @@ for (;; pptr++)
 #endif  /* MAYBE_UTF_MULTI */

      /* Handle the case of a single code unit - either with no UTF support, or
-      with UTF disabled, or for a single-code-unit UTF character. */
+      with UTF disabled, or for a single-code-unit UTF character. In the latter
+      case, for a repeated positive match, get the caseless flag for the
+      required code unit from the previous character, because a class like [Aa]
+      sets a caseless A but by now the req_caseopt flag has been reset. */
+
        {
        mcbuffer[0] = code[-1];
        mclength = 1;
        if (op_previous <= OP_CHARI && repeat_min > 1)
          {
          reqcu = mcbuffer[0];
-          reqcuflags = req_caseopt | cb->req_varyopt;
+          reqcuflags = cb->req_varyopt;
+          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
          }
        }
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
@ -7034,7 +7129,7 @@ for (;; pptr++)
          *lengthptr += delta;
          }

-        else for (i = 0; i < replicate; i++)
+        else for (int i = 0; i < replicate; i++)
          {
          memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
          previous = code;
@ -7210,12 +7305,12 @@ for (;; pptr++)

            else
              {
-              if (groupsetfirstcu && reqcuflags < 0)
+              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
                {
                reqcu = firstcu;
                reqcuflags = firstcuflags;
                }
-              for (i = 1; (uint32_t)i < repeat_min; i++)
+              for (uint32_t i = 1; i < repeat_min; i++)
                {
                memcpy(code, previous, CU2BYTES(len));
                code += len;
@ -7259,14 +7354,14 @@ for (;; pptr++)

          /* This is compiling for real */

-          else for (i = repeat_max - 1; i >= 0; i--)
+          else for (uint32_t i = repeat_max; i >= 1; i--)
            {
            *code++ = OP_BRAZERO + repeat_type;

            /* All but the final copy start a new nesting, maintaining the
            chain of brackets outstanding. */

-            if (i != 0)
+            if (i != 1)
              {
              int linkoffset;
              *code++ = OP_BRA;
@ -7985,9 +8080,9 @@ Arguments:
  errorcodeptr      -> pointer to error code variable
  skipunits         skip this many code units at start (for brackets and OP_COND)
  firstcuptr        place to put the first required code unit
-  firstcuflagsptr   place to put the first code unit flags, or a negative number
+  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
-  reqcuflagsptr     place to put the last required code unit flags, or a negative number
+  reqcuflagsptr     place to put the last required code unit flags
  bcptr             pointer to the chain of currently open branches
  cb                points to the data block with tables pointers etc.
  lengthptr         NULL during the real compile phase
@ -8001,7 +8096,7 @@ Returns:            0 There has been an error
 static int
 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
  int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
-  int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
+  uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr,
  branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
 {
 PCRE2_UCHAR *code = *codeptr;
@ -8014,9 +8109,9 @@ int okreturn = 1;
 uint32_t *pptr = *pptrptr;
 uint32_t firstcu, reqcu;
 uint32_t lookbehindlength;
-int32_t firstcuflags, reqcuflags;
+uint32_t firstcuflags, reqcuflags;
 uint32_t branchfirstcu, branchreqcu;
-int32_t branchfirstcuflags, branchreqcuflags;
+uint32_t branchfirstcuflags, branchreqcuflags;
 PCRE2_SIZE length;
 branch_chain bc;

@ -8135,9 +8230,9 @@ for (;;)

      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
        {
-        if (firstcuflags >= 0)
+        if (firstcuflags < REQ_NONE)
          {
-          if (reqcuflags < 0)
+          if (reqcuflags >= REQ_NONE)
            {
            reqcu = firstcu;
            reqcuflags = firstcuflags;
@ -8149,8 +8244,8 @@ for (;;)
      /* If we (now or from before) have no firstcu, a firstcu from the
      branch becomes a reqcu if there isn't a branch reqcu. */

-      if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
-          branchreqcuflags < 0)
+      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
+          branchreqcuflags >= REQ_NONE)
        {
        branchreqcu = branchfirstcu;
        branchreqcuflags = branchfirstcuflags;
@ -8298,7 +8393,7 @@ Returns:     TRUE or FALSE
 */

 static BOOL
-is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
+is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
  int atomcount, BOOL inassert)
 {
 do {
@ -8321,7 +8416,7 @@ do {
            op == OP_SCBRA || op == OP_SCBRAPOS)
     {
     int n = GET2(scode, 1+LINK_SIZE);
-     int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
+     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
     if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
     }

@ -8681,15 +8776,15 @@ Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
 */

 static uint32_t
-find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
+find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
 {
 uint32_t c = 0;
-int cflags = REQ_NONE;
+uint32_t cflags = REQ_NONE;

 *flags = REQ_NONE;
 do {
   uint32_t d;
-   int dflags;
+   uint32_t dflags;
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
@ -8712,9 +8807,8 @@ do {
     case OP_SCRIPT_RUN:
     d = find_firstassertedcu(scode, &dflags, inassert +
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
-     if (dflags < 0)
-       return 0;
-     if (cflags < 0) { c = d; cflags = dflags; }
+     if (dflags >= REQ_NONE) return 0;
+     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
       else if (c != d || cflags != dflags) return 0;
     break;

@ -8727,7 +8821,7 @@ do {
     case OP_MINPLUS:
     case OP_POSPLUS:
     if (inassert == 0) return 0;
-     if (cflags < 0) { c = scode[1]; cflags = 0; }
+     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
       else if (c != scode[1]) return 0;
     break;

@ -8753,7 +8847,7 @@ do {
 #endif
 #endif

-     if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
+     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
       else if (c != scode[1]) return 0;
     break;
     }
@ -9689,7 +9783,7 @@ PCRE2_SIZE re_blocksize;              /* Size of memory block */
 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */

-int32_t firstcuflags, reqcuflags;     /* Type of first/req code unit */
+uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
 uint32_t setflags = 0;                /* NL and BSR set flags */

@ -10369,13 +10463,13 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
  (these are not saved during the compile because they can cause conflicts with
  actual literals that follow). */

-  if (firstcuflags < 0)
+  if (firstcuflags >= REQ_NONE)
    firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);

  /* Save the data for a first code unit. The existence of one means the
  minimum length must be at least 1. */

-  if (firstcuflags >= 0)
+  if (firstcuflags < REQ_NONE)
    {
    re->first_codeunit = firstcu;
    re->flags |= PCRE2_FIRSTSET;
@ -10422,16 +10516,16 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
  different character and not a non-starting code unit of the first character,
  because the minimum length count is in characters, not code units. */

-  if (reqcuflags >= 0)
+  if (reqcuflags < REQ_NONE)
    {
 #if PCRE2_CODE_UNIT_WIDTH == 16
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
-        firstcuflags < 0 ||                         /* First not set */
+        firstcuflags >= REQ_NONE ||                 /* First not set */
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
 #elif PCRE2_CODE_UNIT_WIDTH == 8
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
-        firstcuflags < 0 ||                         /* First not set */
+        firstcuflags >= REQ_NONE ||                 /* First not set */
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
 #endif
@ -10528,4 +10622,10 @@ re = NULL;
 goto EXIT;
 }

+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND   /* Field containing processed string end */
+
 /* End of pcre2_compile.c */
--- a/src/pcre2_convert.c
+++ b/src/pcre2_convert.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -65,9 +65,8 @@ POSSIBILITY OF SUCH DAMAGE.
 #define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
 #define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS

-/* States for range and POSIX processing */
+/* States for POSIX processing */

-enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED };
 enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,
       POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };

--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2021 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -350,7 +350,7 @@ Returns:            the return from the callout
 */

 static int
-do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
+do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
  PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
  PCRE2_SIZE *lengthptr)
 {
@ -1193,6 +1193,11 @@ for (;;)
          OK = prop->script == code[2];
          break;

+          case PT_SCX:
+          OK = (prop->script == code[2] ||
+                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
+          break;
+
          /* These are specials for combination cases. */

          case PT_ALNUM:
@ -1240,6 +1245,15 @@ for (;;)
               c >= 0xe000;
          break;

+          case PT_BIDICL:
+          OK = UCD_BIDICLASS(c) == code[2];
+          break;
+
+          case PT_BOOL:
+          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+            UCD_BPROPS_PROP(prop), code[2]) != 0;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@ -1451,6 +1465,11 @@ for (;;)
          OK = prop->script == code[3];
          break;

+          case PT_SCX:
+          OK = (prop->script == code[3] ||
+                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
+          break;
+
          /* These are specials for combination cases. */

          case PT_ALNUM:
@ -1498,6 +1517,15 @@ for (;;)
               c >= 0xe000;
          break;

+          case PT_BIDICL:
+          OK = UCD_BIDICLASS(c) == code[3];
+          break;
+
+          case PT_BOOL:
+          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+            UCD_BPROPS_PROP(prop), code[3]) != 0;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@ -1692,6 +1720,11 @@ for (;;)
          OK = prop->script == code[3];
          break;

+          case PT_SCX:
+          OK = (prop->script == code[3] ||
+                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
+          break;
+
          /* These are specials for combination cases. */

          case PT_ALNUM:
@ -1739,6 +1772,15 @@ for (;;)
               c >= 0xe000;
          break;

+          case PT_BIDICL:
+          OK = UCD_BIDICLASS(c) == code[3];
+          break;
+
+          case PT_BOOL:
+          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+            UCD_BPROPS_PROP(prop), code[3]) != 0;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@ -1958,6 +2000,12 @@ for (;;)
          OK = prop->script == code[1 + IMM2_SIZE + 2];
          break;

+          case PT_SCX:
+          OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
+                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
+                  code[1 + IMM2_SIZE + 2]) != 0);
+          break;
+
          /* These are specials for combination cases. */

          case PT_ALNUM:
@ -2005,6 +2053,15 @@ for (;;)
               c >= 0xe000;
          break;

+          case PT_BIDICL:
+          OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
+          break;
+
+          case PT_BOOL:
+          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+            UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@ -2742,7 +2799,7 @@ for (;;)
            || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
          {
          PCRE2_SIZE callout_length;
-          rrc = do_callout(code, offsets, current_subject, ptr, mb,
+          rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
            1 + LINK_SIZE, &callout_length);
          if (rrc < 0) return rrc;                 /* Abandon */
          if (rrc > 0) break;                      /* Fail this thread */
@ -3139,7 +3196,7 @@ for (;;)
      case OP_CALLOUT_STR:
        {
        PCRE2_SIZE callout_length;
-        rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
+        rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
          &callout_length);
        if (rrc < 0) return rrc;   /* Abandon */
        if (rrc == 0)
@ -3285,8 +3342,15 @@ rws->next = NULL;
 rws->size = RWS_BASE_SIZE;
 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;

-/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
-subject string. */
+/* Recognize NULL, length 0 as an empty string. */
+
+if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
+
+/* Plausibility checks */
+
+if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
+if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
+  return PCRE2_ERROR_NULL;

 if (length == PCRE2_ZERO_TERMINATED)
  {
@ -3294,11 +3358,6 @@ if (length == PCRE2_ZERO_TERMINATED)
  was_zero_terminated = 1;
  }

-/* Plausibility checks */
-
-if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
-if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
-  return PCRE2_ERROR_NULL;
 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;

@ -3998,4 +4057,10 @@ while (rws->next != NULL)
 return rc;
 }

+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND   /* Field containing processed string end */
+
 /* End of pcre2_dfa_match.c */
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@ -119,7 +119,7 @@ static const unsigned char compile_error_texts[] =
  /* 45 */
  "this version of PCRE2 does not have support for \\P, \\p, or \\X\0"
  "malformed \\P or \\p sequence\0"
-  "unknown property name after \\P or \\p\0"
+  "unknown property after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
@ -253,7 +253,7 @@ static const unsigned char match_error_texts[] =
  "unknown substring\0"
  /* 50 */
  "non-unique substring name\0"
-  "NULL argument passed\0"
+  "NULL argument passed with non-zero length\0"
  "nested recursion at the same subject position\0"
  "matching depth limit exceeded\0"
  "requested value is not available\0"
--- a/src/pcre2_extuni.c
+++ b/src/pcre2_extuni.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2021 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -105,7 +105,7 @@ while (eptr < end_subject)
  /* Not breaking between Regional Indicators is allowed only if there
  are an even number of preceding RIs. */

-  if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator)
+  if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
    {
    int ricount = 0;
    PCRE2_SPTR bptr = eptr - 1;
@ -123,7 +123,7 @@ while (eptr < end_subject)
        }
      else
      c = *bptr;
-      if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break;
+      if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
      ricount++;
      }
    if ((ricount & 1) != 0) break;  /* Grapheme break required */
--- a/src/pcre2_fuzzsupport.c
+++ b/src/pcre2_fuzzsupport.c
@ -151,6 +151,10 @@ for (i = 0; i < 2; i++)
    int j;
    uint32_t save_match_options = match_options;

+#ifdef SUPPORT_JIT
+    pcre2_jit_compile(code, PCRE2_JIT_COMPLETE);
+#endif
+
    /* Create match data and context blocks only when we first need them. Set
    low match and depth limits to avoid wasting too much searching large
    pattern trees. Almost all matches are going to fail. */
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2020 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -220,18 +220,17 @@ not rely on this. */

 #define COMPILE_ERROR_BASE 100

-/* The initial frames vector for remembering backtracking points in
-pcre2_match() is allocated on the system stack, of this size (bytes). The size
-must be a multiple of sizeof(PCRE2_SPTR) in all environments, so making it a
-multiple of 8 is best. Typical frame sizes are a few hundred bytes (it depends
-on the number of capturing parentheses) so 20KiB handles quite a few frames. A
-larger vector on the heap is obtained for patterns that need more frames. The
-maximum size of this can be limited. */
+/* The initial frames vector for remembering pcre2_match() backtracking points
+is allocated on the heap, of this size (bytes) or ten times the frame size if
+larger, unless the heap limit is smaller. Typical frame sizes are a few hundred
+bytes (it depends on the number of capturing parentheses) so 20KiB handles
+quite a few frames. A larger vector on the heap is obtained for matches that
+need more frames, subject to the heap limit. */

 #define START_FRAMES_SIZE 20480

-/* Similarly, for DFA matching, an initial internal workspace vector is
-allocated on the stack. */
+/* For DFA matching, an initial internal workspace vector is allocated on the
+stack. The heap is used only if this turns out to be too small. */

 #define DFA_START_RWS_SIZE 30720

@ -954,6 +953,13 @@ a positive value. */
 #define STRING_LIMIT_RECURSION_EQ         "LIMIT_RECURSION="
 #define STRING_MARK                       "MARK"

+#define STRING_bc                         "bc"
+#define STRING_bidiclass                  "bidiclass"
+#define STRING_sc                         "sc"
+#define STRING_script                     "script"
+#define STRING_scriptextensions           "scriptextensions"
+#define STRING_scx                        "scx"
+
 #else  /* SUPPORT_UNICODE */

 /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
@ -1248,26 +1254,39 @@ only. */
 #define STRING_LIMIT_RECURSION_EQ         STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
 #define STRING_MARK                       STR_M STR_A STR_R STR_K

+#define STRING_bc                         STR_b STR_c
+#define STRING_bidiclass                  STR_b STR_i STR_d STR_i STR_c STR_l STR_a STR_s STR_s
+#define STRING_sc                         STR_s STR_c
+#define STRING_script                     STR_s STR_c STR_r STR_i STR_p STR_t
+#define STRING_scriptextensions           STR_s STR_c STR_r STR_i STR_p STR_t STR_e STR_x STR_t STR_e STR_n STR_s STR_i STR_o STR_n STR_s
+#define STRING_scx                        STR_s STR_c STR_x
+
+
 #endif  /* SUPPORT_UNICODE */

 /* -------------------- End of character and string names -------------------*/

 /* -------------------- Definitions for compiled patterns -------------------*/

-/* Codes for different types of Unicode property */
+/* Codes for different types of Unicode property. If these definitions are
+changed, the autopossessifying table in pcre2_auto_possess.c must be updated to
+match. */

 #define PT_ANY        0    /* Any property - matches all chars */
 #define PT_LAMP       1    /* L& - the union of Lu, Ll, Lt */
 #define PT_GC         2    /* Specified general characteristic (e.g. L) */
 #define PT_PC         3    /* Specified particular characteristic (e.g. Lu) */
-#define PT_SC         4    /* Script (e.g. Han) */
-#define PT_ALNUM      5    /* Alphanumeric - the union of L and N */
-#define PT_SPACE      6    /* Perl space - Z plus 9,10,12,13 */
-#define PT_PXSPACE    7    /* POSIX space - Z plus 9,10,11,12,13 */
-#define PT_WORD       8    /* Word - L plus N plus underscore */
-#define PT_CLIST      9    /* Pseudo-property: match character list */
-#define PT_UCNC      10    /* Universal Character nameable character */
-#define PT_TABSIZE   11    /* Size of square table for autopossessify tests */
+#define PT_SC         4    /* Script only (e.g. Han) */
+#define PT_SCX        5    /* Script extensions (includes SC) */
+#define PT_ALNUM      6    /* Alphanumeric - the union of L and N */
+#define PT_SPACE      7    /* Perl space - general category Z plus 9,10,12,13 */
+#define PT_PXSPACE    8    /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD       9    /* Word - L plus N plus underscore */
+#define PT_CLIST     10    /* Pseudo-property: match character list */
+#define PT_UCNC      11    /* Universal Character nameable character */
+#define PT_BIDICL    12    /* Specified bidi class */
+#define PT_BOOL      13    /* Boolean property */
+#define PT_TABSIZE   14    /* Size of square table for autopossessify tests */

 /* The following special properties are used only in XCLASS items, when POSIX
 classes are specified and PCRE2_UCP is set - in other words, for Unicode
@ -1275,22 +1294,27 @@ handling of these classes. They are not available via the \p or \P escapes like
 those in the above list, and so they do not take part in the autopossessifying
 table. */

-#define PT_PXGRAPH   11    /* [:graph:] - characters that mark the paper */
-#define PT_PXPRINT   12    /* [:print:] - [:graph:] plus non-control spaces */
-#define PT_PXPUNCT   13    /* [:punct:] - punctuation characters */
+#define PT_PXGRAPH   14    /* [:graph:] - characters that mark the paper */
+#define PT_PXPRINT   15    /* [:print:] - [:graph:] plus non-control spaces */
+#define PT_PXPUNCT   16    /* [:punct:] - punctuation characters */
+
+/* This value is used when parsing \p and \P escapes to indicate that neither
+\p{script:...} nor \p{scx:...} has been encountered. */
+
+#define PT_NOTSCRIPT 255

 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
 contain characters with values greater than 255. */

-#define XCL_NOT       0x01    /* Flag: this is a negative class */
-#define XCL_MAP       0x02    /* Flag: a 32-byte map is present */
-#define XCL_HASPROP   0x04    /* Flag: property checks are present. */
+#define XCL_NOT      0x01  /* Flag: this is a negative class */
+#define XCL_MAP      0x02  /* Flag: a 32-byte map is present */
+#define XCL_HASPROP  0x04  /* Flag: property checks are present. */

-#define XCL_END       0    /* Marks end of individual items */
-#define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
-#define XCL_RANGE     2    /* A range (two multibyte chars) follows */
-#define XCL_PROP      3    /* Unicode property (2-byte property code follows) */
-#define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */
+#define XCL_END      0     /* Marks end of individual items */
+#define XCL_SINGLE   1     /* Single item (one multibyte char) follows */
+#define XCL_RANGE    2     /* A range (two multibyte chars) follows */
+#define XCL_PROP     3     /* Unicode property (2-byte property code follows) */
+#define XCL_NOTPROP  4     /* Unicode inverted property (ditto) */

 /* These are escaped items that aren't just an encoding of a particular data
 value such as \n. They must have non-zero values, as check_escape() returns 0
@ -1797,8 +1821,8 @@ typedef struct {
  uint8_t gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
  uint8_t caseset;    /* offset to multichar other cases or zero */
  int32_t other_case; /* offset to other case, or zero if none */
-  int16_t scriptx;    /* script extension value */
-  int16_t dummy;      /* spare - to round to multiple of 4 bytes */
+  uint16_t scriptx_bidiclass; /* script extension (11 bit) and bidi class (5 bit) values */
+  uint16_t bprops;    /* binary properties offset */
 } ucd_record;

 /* UCD access macros */
@ -1815,13 +1839,30 @@ typedef struct {
 #define GET_UCD(ch) REAL_GET_UCD(ch)
 #endif

+#define UCD_SCRIPTX_MASK 0x3ff  /* Keeps the low 10 bits; NOTE(review): the ucd_record field comment says 11 bit - confirm */
+#define UCD_BIDICLASS_SHIFT 11  /* Bidi class is held in the bits from bit 11 upwards */
+#define UCD_BPROPS_MASK 0xfff  /* Keeps the low 12 bits of the bprops field */
+
+#define UCD_SCRIPTX_PROP(prop) ((prop)->scriptx_bidiclass & UCD_SCRIPTX_MASK)  /* Script extension offset (added to ucd_script_sets by users) */
+#define UCD_BIDICLASS_PROP(prop) ((prop)->scriptx_bidiclass >> UCD_BIDICLASS_SHIFT)  /* Bidi class value from the shared field */
+#define UCD_BPROPS_PROP(prop) ((prop)->bprops & UCD_BPROPS_MASK)  /* Boolean properties offset (added to ucd_boolprop_sets by users) */
+
 #define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
 #define UCD_SCRIPT(ch)      GET_UCD(ch)->script
 #define UCD_CATEGORY(ch)    PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
 #define UCD_GRAPHBREAK(ch)  GET_UCD(ch)->gbprop
 #define UCD_CASESET(ch)     GET_UCD(ch)->caseset
 #define UCD_OTHERCASE(ch)   ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
-#define UCD_SCRIPTX(ch)     GET_UCD(ch)->scriptx
+#define UCD_SCRIPTX(ch)     UCD_SCRIPTX_PROP(GET_UCD(ch))
+#define UCD_BPROPS(ch)      UCD_BPROPS_PROP(GET_UCD(ch))
+#define UCD_BIDICLASS(ch)   UCD_BIDICLASS_PROP(GET_UCD(ch))
+
+/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words
+that form a bitmap representing a list of scripts or boolean properties. These
+macros test or set a bit in the map by number. */
+
+#define MAPBIT(map,n) ((map)[(n)/32]&(1u<<((n)%32)))  /* Yields the masked bit (zero or non-zero, not 0/1); compare with 0 */
+#define MAPSET(map,n) ((map)[(n)/32]|=(1u<<((n)%32)))  /* Sets bit n in place; map must be a modifiable vector of 32-bit words */

 /* Header for serialized pcre2 codes. */

@ -1878,6 +1919,7 @@ extern const uint8_t          PRIV(utf8_table4)[];
 #endif
 #define _pcre2_hspace_list             PCRE2_SUFFIX(_pcre2_hspace_list_)
 #define _pcre2_vspace_list             PCRE2_SUFFIX(_pcre2_vspace_list_)
+#define _pcre2_ucd_boolprop_sets       PCRE2_SUFFIX(_pcre2_ucd_boolprop_sets_)
 #define _pcre2_ucd_caseless_sets       PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
 #define _pcre2_ucd_digit_sets          PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
 #define _pcre2_ucd_script_sets         PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
@ -1901,9 +1943,10 @@ extern const pcre2_match_context       PRIV(default_match_context);
 extern const uint8_t                   PRIV(default_tables)[];
 extern const uint32_t                  PRIV(hspace_list)[];
 extern const uint32_t                  PRIV(vspace_list)[];
+extern const uint32_t                  PRIV(ucd_boolprop_sets)[];
 extern const uint32_t                  PRIV(ucd_caseless_sets)[];
 extern const uint32_t                  PRIV(ucd_digit_sets)[];
-extern const uint8_t                   PRIV(ucd_script_sets)[];
+extern const uint32_t                  PRIV(ucd_script_sets)[];
 extern const ucd_record                PRIV(ucd_records)[];
 #if PCRE2_CODE_UNIT_WIDTH == 32
 extern const ucd_record                PRIV(dummy_ucd_record)[];
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -519,7 +519,7 @@ it is. This is called only in UTF-32 mode - we don't put a test within the
 macro because almost all calls are already within a block of UTF-32 only
 code.

-These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */
+These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

 #define BACKCHAR(eptr) do { } while (0)

@ -649,19 +649,23 @@ the size varies from call to call. As the maximum number of capturing
 subpatterns is 65535 we must allow for 65536 strings to include the overall
 match. (See also the heapframe structure below.) */

+struct heapframe;  /* Forward reference */
+
 typedef struct pcre2_real_match_data {
-  pcre2_memctl     memctl;
-  const pcre2_real_code *code;    /* The pattern used for the match */
-  PCRE2_SPTR       subject;       /* The subject that was matched */
-  PCRE2_SPTR       mark;          /* Pointer to last mark */
-  PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
-  PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
-  PCRE2_SIZE       startchar;     /* Offset to starting code unit */
-  uint8_t          matchedby;     /* Type of match (normal, JIT, DFA) */
-  uint8_t          flags;         /* Various flags */
-  uint16_t         oveccount;     /* Number of pairs */
-  int              rc;            /* The return code from the match */
-  PCRE2_SIZE       ovector[131072]; /* Must be last in the structure */
+  pcre2_memctl     memctl;           /* Memory control fields */
+  const pcre2_real_code *code;       /* The pattern used for the match */
+  PCRE2_SPTR       subject;          /* The subject that was matched */
+  PCRE2_SPTR       mark;             /* Pointer to last mark */
+  struct heapframe *heapframes;      /* Backtracking frames heap memory */
+  PCRE2_SIZE       heapframes_size;  /* Malloc-ed size */
+  PCRE2_SIZE       leftchar;         /* Offset to leftmost code unit */
+  PCRE2_SIZE       rightchar;        /* Offset to rightmost code unit */
+  PCRE2_SIZE       startchar;        /* Offset to starting code unit */
+  uint8_t          matchedby;        /* Type of match (normal, JIT, DFA) */
+  uint8_t          flags;            /* Various flags */
+  uint16_t         oveccount;        /* Number of pairs */
+  int              rc;               /* The return code from the match */
+  PCRE2_SIZE       ovector[131072];  /* Must be last in the structure */
 } pcre2_real_match_data;


@ -747,8 +751,8 @@ typedef struct compile_block {
  uint32_t class_range_start;      /* Overall class range start */
  uint32_t class_range_end;        /* Overall class range end */
  PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
+  uint32_t req_varyopt;            /* "After variable item" flag for reqbyte */
  int  max_lookbehind;             /* Maximum lookbehind (characters) */
-  int  req_varyopt;                /* "After variable item" flag for reqbyte */
  BOOL had_accept;                 /* (*ACCEPT) encountered */
  BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
  BOOL had_recurse;                /* Had a recursion or subroutine call */
@ -764,7 +768,7 @@ typedef struct pcre2_real_jit_stack {
 } pcre2_real_jit_stack;

 /* Structure for items in a linked list that represents an explicit recursive
-call within the pattern when running pcre_dfa_match(). */
+call within the pattern when running pcre2_dfa_match(). */

 typedef struct dfa_recursion_info {
  struct dfa_recursion_info *prevrec;
@ -838,15 +842,22 @@ multiple of PCRE2_SIZE. See various comments above. */
 typedef char check_heapframe_size[
  ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];

+/* Structure for computing the alignment of heapframe. */
+
+typedef struct heapframe_align {
+  char unalign;    /* Completely unalign the current offset */
+  heapframe frame; /* Offset is its alignment */
+} heapframe_align;
+
+/* This define is the minimum alignment required for a heapframe, in bytes. */
+
+#define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame)
+
 /* Structure for passing "static" information around between the functions
 doing traditional NFA matching (pcre2_match() and friends). */

 typedef struct match_block {
  pcre2_memctl memctl;            /* For general use */
-  PCRE2_SIZE frame_vector_size;   /* Size of a backtracking frame */
-  heapframe *match_frames;        /* Points to vector of frames */
-  heapframe *match_frames_top;    /* Points after the end of the vector */
-  heapframe *stack_frames;        /* The original vector on the stack */
  PCRE2_SIZE heap_limit;          /* As it says */
  uint32_t match_limit;           /* As it says */
  uint32_t match_limit_depth;     /* As it says */
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
--- a/src/pcre2_jit_match.c
+++ b/src/pcre2_jit_match.c
@ -120,7 +120,7 @@ else if ((options & PCRE2_PARTIAL_SOFT) != 0)
 if (functions == NULL || functions->executable_funcs[index] == NULL)
  return PCRE2_ERROR_JIT_BADOPTION;

-/* Sanity checks should be handled by pcre_exec. */
+/* Sanity checks should be handled by pcre2_match. */
 arguments.str = subject + start_offset;
 arguments.begin = subject;
 arguments.end = subject + length;
--- a/src/pcre2_jit_misc.c
+++ b/src/pcre2_jit_misc.c
@ -135,7 +135,7 @@ return NULL;

 pcre2_jit_stack *jit_stack;

-if (startsize < 1 || maxsize < 1)
+if (startsize == 0 || maxsize == 0 || maxsize > SIZE_MAX - STACK_GROWTH_RATE)
  return NULL;
 if (startsize > maxsize)
  startsize = maxsize;
--- a/src/pcre2_jit_simd_inc.h
+++ b/src/pcre2_jit_simd_inc.h
@ -339,7 +339,7 @@ if (common->mode != PCRE2_JIT_COMPLETE)
  {
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
-  OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
+  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
  }
 else
@ -537,7 +537,7 @@ if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

-  OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
+  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  CMOV(SLJIT_LESS, STR_END, TMP1, 0);
  }

@ -883,14 +883,14 @@ if (char1 == char2)

 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
-    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                     SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_utf));
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
  else
-    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                     SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
 #else
-  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                   SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
+  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                   SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
 #endif
  }
 else
@ -904,14 +904,14 @@ else

 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf && offset > 0)
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask_utf));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
    else
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
 #else
-    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                     SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
 #endif
    }
  else
@ -922,14 +922,14 @@ else

 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf && offset > 0)
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2_utf));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
    else
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
 #else
-    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
-                     SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
 #endif
    }
  }
@ -1067,7 +1067,7 @@ else
  OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

-  OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, STR_END, 0, SLJIT_R0, 0);
+  OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
  CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0);
  }

@ -1084,31 +1084,31 @@ if (diff == 1) {
  if (char1a == char1b && char2a == char2b) {
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf)
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0_utf));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
    else
 #endif
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
  } else {
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf)
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1_utf));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
    else
 #endif
-      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
-                       SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1));
+      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
  }
 } else {
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
-    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
-                     SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default_utf));
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
  else
 #endif
-    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
-                     SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default));
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
 }

 /* Restore STR_PTR register. */
@ -1418,7 +1418,7 @@ if (common->mode != PCRE2_JIT_COMPLETE)
  {
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
-  OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
+  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
  }
 else
@ -1673,7 +1673,7 @@ if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

-  OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
+  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  CMOV(SLJIT_LESS, STR_END, TMP1, 0);
  }

--- a/Show More
+++ b/Show More