Cleanup of Makefile.os4, added release rule and a README file for this release

Implement -Z in pcre2grep and update documentation
Added some special heap tests
2022-07-31 20:34:33 +01:00 · 2022-07-30 17:41:49 +01:00 · 2022-07-28 17:58:19 +01:00 · 2022-07-27 18:00:40 +01:00 · 2022-07-27 17:44:55 +01:00 · 2022-07-15 17:18:11 +01:00
190 changed files with 57580 additions and 20001 deletions
--- a/.bazelrc
+++ b/.bazelrc
@ -0,0 +1,3 @@
+common --experimental_enable_bzlmod
+build --incompatible_enable_cc_toolchain_resolution
+build --incompatible_strict_action_env
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -0,0 +1,77 @@
+name: Build
+on: [push, pull_request]
+permissions: read-all  # build/test jobs only read the repo; keep GITHUB_TOKEN read-only
+
+jobs:
+  linux:
+    name: Linux
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+    
+  alpine:
+    name: alpine
+    runs-on: ubuntu-latest
+    container: alpine 
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        
+      - name: Autotools
+        run: apk add --no-cache automake autoconf gcc libtool make musl-dev 
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+        
+  windows:      
+    name: 32bit Windows
+    runs-on: windows-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Configure
+        run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
+
+      - name: Build
+        run: cmake --build build
+
+      - name: Test
+        run: |
+          cd build\Debug
+          ..\..\RunTest.bat
+           
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -0,0 +1,73 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ master ]
+  schedule:
+    - cron: '27 6 * * 4'
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'cpp', 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Learn more about CodeQL language support at https://git.io/codeql-language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v1
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+
+    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v1
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+
+    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+
+    #- run: |
+    #   make bootstrap
+    #   make release
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v1
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@ -0,0 +1,55 @@
+name: Scorecards supply-chain security
+on:
+  # Only the default branch is supported.
+  branch_protection_rule:
+  schedule:
+    - cron: '23 17 * * 1'
+  push:
+    branches: [ master ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecards analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      actions: read
+      contents: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # Read-only PAT token. To create it,
+          # follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
+          repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
+          # Publish the results to enable scorecard badges. For more details, see
+          # https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories, `publish_results` will automatically be set to `false`,
+          # regardless of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional).
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard.
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
+        with:
+          sarif_file: results.sarif
--- a/.gitignore
+++ b/.gitignore
@ -1,47 +1,82 @@
-INSTALL
-Makefile.in
-aclocal.m4
-ar-lib
-compile
-config.guess
-config.sub
-configure
-depcomp
-install-sh
-ltmain.sh
-m4/
-missing
-test-driver
+# Public .gitignore file for PCRE2

-Makefile
-config.log
-config.status
-libpcre2-*.pc
-libtool
-pcre2-config
-src/.deps
-src/config.h
-src/pcre2.h
-src/stamp-h1
-
-.libs
-*.o
-*.lo
 *.a
+*.lo
 *.la
-src/.dirstamp
-src/pcre2_chartables.c
+*.pc
+*.o
+*~
+*.lha

-pcre2grep
-pcre2test
-pcre2_jit_test
+__pycache__
+.deps
+.libs

+INSTALL
+Makefile
+Makefile.in
 RunGrepTest.log
 RunGrepTest.trs
 RunTest.log
 RunTest.trs
+
+aclocal.m4
+ar-lib
+compile
+config.guess
+config.log
+config.status
+config.sub
+configure
+depcomp
+install-sh
+libtool
+ltmain.sh
+missing
+pcre2-config
+pcre2_dftables
+pcre2_jit_test
 pcre2_jit_test.log
 pcre2_jit_test.trs
+pcre2demo
+pcre2fuzzcheck
+pcre2grep
+pcre2test
+test-driver
 test-suite.log
+test3input
+test3output
+testNinput
+testNinputgrep
+teststderr
+teststderrM
+teststderrgrep
+teststdout
+teststdoutM
+testtemp1
+testtemp1grep
+testtemp2
+testtemp2grep
+testtry
+testtrygrep
+
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+
+maint/ucptest
+maint/utf8
+
+src/.deps
+src/.dirstamp
+src/config.h
+src/pcre2.h
+src/pcre2_chartables.c
+src/stamp-h1
+
+/bazel-*
+
+# End

-*~
--- a/8
+++ b/8
@ -5,10 +5,10 @@ Written by:       Philip Hazel
 Email local part: Philip.Hazel
 Email domain:     gmail.com

-University of Cambridge Computing Service,
+Retired from University of Cambridge Computing Service,
 Cambridge, England.

-Copyright (c) 1997-2021 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
 All rights reserved


@ -19,7 +19,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Emain domain:     freemail.hu

-Copyright(c) 2010-2021 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
 All rights reserved.


@ -30,7 +30,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Emain domain:     freemail.hu

-Copyright(c) 2009-2021 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
 All rights reserved.

 ####
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -0,0 +1,72 @@
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")  # exactly the C++ rules this file uses
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+
+copy_file(
+    name = "config_h_generic",
+    src = "src/config.h.generic",
+    out = "src/config.h",
+)
+
+copy_file(
+    name = "pcre2_h_generic",
+    src = "src/pcre2.h.generic",
+    out = "src/pcre2.h",
+)
+
+copy_file(
+    name = "pcre2_chartables_c",
+    src = "src/pcre2_chartables.c.dist",
+    out = "src/pcre2_chartables.c",
+)
+
+cc_library(
+    name = "pcre2",
+    srcs = [
+        "src/pcre2_auto_possess.c",
+        "src/pcre2_compile.c",
+        "src/pcre2_config.c",
+        "src/pcre2_context.c",
+        "src/pcre2_convert.c",
+        "src/pcre2_dfa_match.c",
+        "src/pcre2_error.c",
+        "src/pcre2_extuni.c",
+        "src/pcre2_find_bracket.c",
+        "src/pcre2_maketables.c",
+        "src/pcre2_match.c",
+        "src/pcre2_match_data.c",
+        "src/pcre2_newline.c",
+        "src/pcre2_ord2utf.c",
+        "src/pcre2_pattern_info.c",
+        "src/pcre2_script_run.c",
+        "src/pcre2_serialize.c",
+        "src/pcre2_string_utils.c",
+        "src/pcre2_study.c",
+        "src/pcre2_substitute.c",
+        "src/pcre2_substring.c",
+        "src/pcre2_tables.c",
+        "src/pcre2_ucd.c",
+        "src/pcre2_ucptables.c",
+        "src/pcre2_valid_utf.c",
+        "src/pcre2_xclass.c",
+        ":pcre2_chartables_c",
+    ],
+    hdrs = glob(["src/*.h"]) + [
+        ":config_h_generic",
+        ":pcre2_h_generic",
+    ],
+    defines = [
+        "HAVE_CONFIG_H",
+        "PCRE2_CODE_UNIT_WIDTH=8",
+        "PCRE2_STATIC",
+    ],
+    includes = ["src"],
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+)
+
+cc_binary(
+    name = "pcre2demo",
+    srcs = ["src/pcre2demo.c"],
+    visibility = ["//visibility:public"],
+    deps = [":pcre2"],
+)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -94,16 +94,27 @@
 # 2020-04-28 PH added function check for memfd_create based on Carlo's patch
 # 2020-05-25 PH added a check for Intel CET
 # 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel
+# 2021-06-29 JWSB added the option to build static library with PIC.
+# 2021-07-05 JWSB modified so that both the static and shared libraries can
+#            be built in one go.
+# 2021-08-28 PH increased minimum version
+# 2021-08-28 PH added test for realpath()

 PROJECT(PCRE2 C)

 # Increased minimum to 2.8.5 to support GNUInstallDirs.
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8.5)
+# Increased minimum to 3.1 to support imported targets.
+CMAKE_MINIMUM_REQUIRED(VERSION 3.1)

 # Set policy CMP0026 to avoid warnings for the use of LOCATION in
 # GET_TARGET_PROPERTY. This should no longer be required.
 # CMAKE_POLICY(SET CMP0026 OLD)

+# With a recent cmake, you can provide a rootdir to look for non
+# standard installed library dependencies, but to do so, the policy
+# needs to be set to new (by uncommenting the following)
+# CMAKE_POLICY(SET CMP0074 NEW)
+
 # For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
 # on the command line.
 # SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@ -128,8 +139,6 @@ INCLUDE(CheckTypeSize)
 INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR

 CHECK_INCLUDE_FILE(dirent.h     HAVE_DIRENT_H)
-CHECK_INCLUDE_FILE(stdint.h     HAVE_STDINT_H)
-CHECK_INCLUDE_FILE(inttypes.h   HAVE_INTTYPES_H)
 CHECK_INCLUDE_FILE(sys/stat.h   HAVE_SYS_STAT_H)
 CHECK_INCLUDE_FILE(sys/types.h  HAVE_SYS_TYPES_H)
 CHECK_INCLUDE_FILE(unistd.h     HAVE_UNISTD_H)
@ -141,6 +150,13 @@ CHECK_SYMBOL_EXISTS(memmove       "string.h"   HAVE_MEMMOVE)
 CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h"   HAVE_SECURE_GETENV)
 CHECK_SYMBOL_EXISTS(strerror      "string.h"   HAVE_STRERROR)

+CHECK_C_SOURCE_COMPILES(
+  "#include <stdlib.h>
+   #include <limits.h>
+   int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
+  HAVE_REALPATH
+)
+
 set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
 set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
 CHECK_C_SOURCE_COMPILES(
@ -172,8 +188,9 @@ ENDIF(INTEL_CET_ENABLED)
 # Note: CMakeSetup displays these in alphabetical order, regardless of
 # the order we use here.

-SET(BUILD_SHARED_LIBS OFF CACHE BOOL
-    "Build shared libraries instead of static ones.")
+SET(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries.")
+
+OPTION(BUILD_STATIC_LIBS "Build static libraries." ON)

 OPTION(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON)

@ -181,6 +198,8 @@ OPTION(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF)

 OPTION(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF)

+OPTION(PCRE2_STATIC_PIC "Build the static library with the option position independent code enabled." OFF)
+
 OPTION(PCRE2_DEBUG "Include debugging code" OFF)

 OPTION(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF)
@ -292,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
 IF(EDITLINE_FOUND)
  OPTION (PCRE2_SUPPORT_LIBEDIT  "Enable support for linking pcre2test with libedit." OFF)
 ENDIF(EDITLINE_FOUND)
-IF(PCRE2_SUPPORT_LIBEDIT)
-  INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
-ENDIF(PCRE2_SUPPORT_LIBEDIT)
+IF(EDITLINE_FOUND)
+  IF(PCRE2_SUPPORT_LIBEDIT)
+    INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
+  ENDIF(PCRE2_SUPPORT_LIBEDIT)
+ELSE(EDITLINE_FOUND)
+  IF(PCRE2_SUPPORT_LIBEDIT)
+    MESSAGE(FATAL_ERROR
+      " libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
+      " or set Editline_ROOT to a full libedit installed tree, as needed\n"
+      " Might need to enable policy CMP0074 in CMakeLists.txt"
+    )
+  ENDIF(PCRE2_SUPPORT_LIBEDIT)
+ENDIF(EDITLINE_FOUND)

 # readline lib
 IF(READLINE_FOUND)
@ -306,9 +335,9 @@ ENDIF(PCRE2_SUPPORT_LIBREADLINE)

 # Prepare build configuration

-IF(NOT BUILD_SHARED_LIBS)
-        SET(PCRE2_STATIC 1)
-ENDIF(NOT BUILD_SHARED_LIBS)
+IF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
+        MESSAGE(FATAL_ERROR "At least one of BUILD_SHARED_LIBS or BUILD_STATIC_LIBS must be enabled.")
+ENDIF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)

 IF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32)
        MESSAGE(FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled")
@ -332,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
 ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)

 IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
-        MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
+        IF(READLINE_FOUND)
+                MESSAGE(FATAL_ERROR
+                  " Only one of the readline compatible libraries can be enabled.\n"
+                  " Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
+                )
+        ENDIF(READLINE_FOUND)
 ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)

 IF(PCRE2_SUPPORT_BSR_ANYCRLF)
@ -348,7 +382,13 @@ IF(PCRE2_SUPPORT_UNICODE)
 ENDIF(PCRE2_SUPPORT_UNICODE)

 IF(PCRE2_SUPPORT_JIT)
-        SET(SUPPORT_JIT 1)
+	SET(SUPPORT_JIT 1)
+	IF(UNIX)
+		FIND_PACKAGE(Threads REQUIRED)
+		IF(CMAKE_USE_PTHREADS_INIT)
+			SET(REQUIRE_PTHREAD 1)
+		ENDIF(CMAKE_USE_PTHREADS_INIT)
+	ENDIF(UNIX)
 ENDIF(PCRE2_SUPPORT_JIT)

 IF(PCRE2_SUPPORT_JIT_SEALLOC)
@ -597,39 +637,37 @@ SET(PCRE2_SOURCES
 SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
 SET(PCRE2POSIX_SOURCES src/pcre2posix.c)

-IF(MINGW AND NOT PCRE2_STATIC)
-IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
-ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
-PRE-LINK
-COMMAND windres ARGS pcre2.rc pcre2.o
-WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-COMMENT Using pcre2 coff info in mingw build)
-SET(PCRE2_SOURCES
-  ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o
-)
-ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
-IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
-PRE-LINK
-COMMAND windres ARGS pcre2posix.rc pcre2posix.o
-WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-COMMENT Using pcre2posix coff info in mingw build)
-SET(PCRE2POSIX_SOURCES
-  ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o
-)
-ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+IF(MINGW AND BUILD_SHARED_LIBS)
+  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
+    ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
+      PRE-LINK
+      COMMAND windres ARGS pcre2.rc pcre2.o
+      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+      COMMENT Using pcre2 coff info in mingw build)
+    SET(PCRE2_SOURCES ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o)
+  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)

-IF(MSVC AND NOT PCRE2_STATIC)
-IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
-SET(PCRE2_SOURCES
-  ${PCRE2_SOURCES} pcre2.rc)
-ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
-IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-SET(PCRE2POSIX_SOURCES
-  ${PCRE2POSIX_SOURCES} pcre2posix.rc)
-ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-ENDIF(MSVC AND NOT PCRE2_STATIC)
+  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
+    ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
+      PRE-LINK
+      COMMAND windres ARGS pcre2posix.rc pcre2posix.o
+      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+      COMMENT Using pcre2posix coff info in mingw build)
+    SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o)
+  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
+ENDIF(MINGW AND BUILD_SHARED_LIBS)
+
+IF(MSVC AND BUILD_SHARED_LIBS)
+  SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
+  SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
+  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
+    SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
+  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
+
+  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
+    SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} pcre2posix.rc)
+  ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
+ENDIF(MSVC AND BUILD_SHARED_LIBS)

 # Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681
 # This code was taken from the CMake wiki, not from WebM.
@ -658,76 +696,213 @@ SET(targets)
 # 8-bit library

 IF(PCRE2_BUILD_PCRE2_8)
-ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
-SET_TARGET_PROPERTIES(pcre2-8 PROPERTIES
-  COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
-  MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
-  MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
-  VERSION ${LIBPCRE2_8_VERSION}
-  SOVERSION ${LIBPCRE2_8_SOVERSION})
-SET(targets ${targets} pcre2-8)
-ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
-SET_TARGET_PROPERTIES(pcre2-posix PROPERTIES
-  COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
-  MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
-  MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
-  VERSION ${LIBPCRE2_POSIX_VERSION}
-  SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
-SET(targets ${targets} pcre2-posix)
-TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-8-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_8_VERSION}
+      SOVERSION ${LIBPCRE2_8_SOVERSION})
+    TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-8-static)
+    ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+    SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_POSIX_VERSION}
+      SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
+    TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
+    TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
+    SET(targets ${targets} pcre2-posix-static)

-IF(MINGW AND NOT PCRE2_STATIC)
-  IF(NON_STANDARD_LIB_PREFIX)
-    SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "")
-  ENDIF(NON_STANDARD_LIB_PREFIX)
-  IF(NON_STANDARD_LIB_SUFFIX)
-    SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll")
-  ENDIF(NON_STANDARD_LIB_SUFFIX)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+    IF(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8-static)
+      SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix-static)
+    ELSE(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8)
+      SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix)
+    ENDIF(MSVC)
+    IF(PCRE2_STATIC_PIC)
+      SET_TARGET_PROPERTIES(pcre2-8-static pcre2-posix-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    ENDIF(PCRE2_STATIC_PIC)
+  ENDIF(BUILD_STATIC_LIBS)
+
+  IF(BUILD_SHARED_LIBS)
+    ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_8_VERSION}
+      SOVERSION ${LIBPCRE2_8_SOVERSION}
+      OUTPUT_NAME pcre2-8)
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-8-shared)
+    ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_POSIX_VERSION}
+      SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
+      OUTPUT_NAME pcre2-posix)
+    TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
+    SET(targets ${targets} pcre2-posix-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
+
+    IF(MINGW)
+      IF(NON_STANDARD_LIB_PREFIX)
+        SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES PREFIX "")
+      ENDIF(NON_STANDARD_LIB_PREFIX)
+      IF(NON_STANDARD_LIB_SUFFIX)
+        SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES SUFFIX "-0.dll")
+      ENDIF(NON_STANDARD_LIB_SUFFIX)
+    ENDIF(MINGW)
+  ENDIF(BUILD_SHARED_LIBS)
+
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-static)
+    ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-static)
+  ELSE(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-shared)
+    ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-shared)
+  ENDIF(BUILD_STATIC_LIBS)
 ENDIF(PCRE2_BUILD_PCRE2_8)

 # 16-bit library

 IF(PCRE2_BUILD_PCRE2_16)
-ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
-SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES
-  COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
-  MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
-  MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
-  VERSION ${LIBPCRE2_16_VERSION}
-  SOVERSION ${LIBPCRE2_16_SOVERSION})
-SET(targets ${targets} pcre2-16)
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_16_VERSION}
+      SOVERSION ${LIBPCRE2_16_SOVERSION}) # all version fields come from the 16-bit variables
+    TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-16-static)

-IF(MINGW AND NOT PCRE2_STATIC)
-  IF(NON_STANDARD_LIB_PREFIX)
-    SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES PREFIX "")
-  ENDIF(NON_STANDARD_LIB_PREFIX)
-  IF(NON_STANDARD_LIB_SUFFIX)
-    SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES SUFFIX "-0.dll")
-  ENDIF(NON_STANDARD_LIB_SUFFIX)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+    IF(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16-static)
+    ELSE(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16)
+    ENDIF(MSVC)
+    IF(PCRE2_STATIC_PIC)
+      SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    ENDIF(PCRE2_STATIC_PIC)
+  ENDIF(BUILD_STATIC_LIBS)
+
+  IF(BUILD_SHARED_LIBS)
+    ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_16_VERSION}
+      SOVERSION ${LIBPCRE2_16_SOVERSION}
+      OUTPUT_NAME pcre2-16) # all version fields come from the 16-bit variables
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-16-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
+
+    IF(MINGW)
+      IF(NON_STANDARD_LIB_PREFIX)
+        SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES PREFIX "")
+      ENDIF(NON_STANDARD_LIB_PREFIX)
+      IF(NON_STANDARD_LIB_SUFFIX)
+        SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES SUFFIX "-0.dll")
+      ENDIF(NON_STANDARD_LIB_SUFFIX)
+    ENDIF(MINGW)
+  ENDIF(BUILD_SHARED_LIBS)
+
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-static)
+  ELSE(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-shared)
+  ENDIF(BUILD_STATIC_LIBS)
 ENDIF(PCRE2_BUILD_PCRE2_16)

 # 32-bit library

 IF(PCRE2_BUILD_PCRE2_32)
-ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
-SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES
-  COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
-  MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
-  MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
-  VERSION ${LIBPCRE2_32_VERSION}
-  SOVERSION ${LIBPCRE2_32_SOVERSION})
-SET(targets ${targets} pcre2-32)
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_32_VERSION}
+      SOVERSION ${LIBPCRE2_32_SOVERSION})
+    TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-32-static)

-IF(MINGW AND NOT PCRE2_STATIC)
-  IF(NON_STANDARD_LIB_PREFIX)
-    SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES PREFIX "")
-  ENDIF(NON_STANDARD_LIB_PREFIX)
-  IF(NON_STANDARD_LIB_SUFFIX)
-    SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES SUFFIX "-0.dll")
-  ENDIF(NON_STANDARD_LIB_SUFFIX)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+    IF(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32-static)
+    ELSE(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32)
+    ENDIF(MSVC)
+    IF(PCRE2_STATIC_PIC)
+      SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    ENDIF(PCRE2_STATIC_PIC)
+  ENDIF(BUILD_STATIC_LIBS)
+
+  IF(BUILD_SHARED_LIBS)
+    ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_32_VERSION}
+      SOVERSION ${LIBPCRE2_32_SOVERSION}
+      OUTPUT_NAME pcre2-32)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-32-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
+
+    IF(MINGW)
+      IF(NON_STANDARD_LIB_PREFIX)
+        SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES PREFIX "")
+      ENDIF(NON_STANDARD_LIB_PREFIX)
+      IF(NON_STANDARD_LIB_SUFFIX)
+        SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES SUFFIX "-0.dll")
+      ENDIF(NON_STANDARD_LIB_SUFFIX)
+    ENDIF(MINGW)
+  ENDIF(BUILD_SHARED_LIBS)
+
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-static)
+  ELSE(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-shared)
+  ENDIF(BUILD_STATIC_LIBS)
 ENDIF(PCRE2_BUILD_PCRE2_32)

 # Executables
@ -900,37 +1075,34 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"

 INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)

+# CMake config files.
+set(PCRE2_CONFIG_IN  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config.cmake.in)
+set(PCRE2_CONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config.cmake)
+configure_file(${PCRE2_CONFIG_IN} ${PCRE2_CONFIG_OUT} @ONLY)
+set(PCRE2_CONFIG_VERSION_IN  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config-version.cmake.in)
+set(PCRE2_CONFIG_VERSION_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config-version.cmake)
+configure_file(${PCRE2_CONFIG_VERSION_IN} ${PCRE2_CONFIG_VERSION_OUT} @ONLY)
+install(FILES ${PCRE2_CONFIG_OUT} ${PCRE2_CONFIG_VERSION_OUT} DESTINATION cmake)
+
 FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
 FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
 FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)

-FOREACH(man ${man3})
-        GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
-        SET(man3_new ${man3} ${man})
-ENDFOREACH(man ${man3})
-SET(man3 ${man3_new})
-
 INSTALL(FILES ${man1} DESTINATION man/man1)
 INSTALL(FILES ${man3} DESTINATION man/man3)
 INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)

 IF(MSVC AND INSTALL_MSVC_PDB)
-    INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
-                  ${PROJECT_BINARY_DIR}/pcre2posix.pdb
-            DESTINATION bin
-            CONFIGURATIONS RelWithDebInfo)
-    INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
-                  ${PROJECT_BINARY_DIR}/pcre2posixd.pdb
-            DESTINATION bin
-            CONFIGURATIONS Debug)
+ INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
+ INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
 ENDIF(MSVC AND INSTALL_MSVC_PDB)

 # Help, only for nice output
-IF(BUILD_SHARED_LIBS)
-  SET(BUILD_STATIC_LIBS OFF)
-ELSE(BUILD_SHARED_LIBS)
+IF(BUILD_STATIC_LIBS)
  SET(BUILD_STATIC_LIBS ON)
-ENDIF(BUILD_SHARED_LIBS)
+ELSE(BUILD_STATIC_LIBS)
+  SET(BUILD_STATIC_LIBS OFF)
+ENDIF(BUILD_STATIC_LIBS)

 IF(PCRE2_HEAP_MATCH_RECURSE)
  MESSAGE(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.")
@ -968,6 +1140,7 @@ IF(PCRE2_SHOW_REPORT)
  MESSAGE(STATUS "  Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}")
  MESSAGE(STATUS "  Build shared libs ............... : ${BUILD_SHARED_LIBS}")
  MESSAGE(STATUS "  Build static libs ............... : ${BUILD_STATIC_LIBS}")
+  MESSAGE(STATUS "     with PIC enabled ............. : ${PCRE2_STATIC_PIC}")
  MESSAGE(STATUS "  Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
  MESSAGE(STATUS "  Enable JIT in pcre2grep ......... : ${PCRE2GREP_SUPPORT_JIT}")
  MESSAGE(STATUS "  Enable callouts in pcre2grep .... : ${PCRE2GREP_SUPPORT_CALLOUT}")
@ -1002,10 +1175,10 @@ IF(PCRE2_SHOW_REPORT)
    MESSAGE(STATUS "  Use %zu and %td ..................: AUTO" )
  ENDIF(PCRE2_DISABLE_PERCENT_ZT)

-  IF(MINGW AND NOT PCRE2_STATIC)
+  IF(MINGW AND BUILD_SHARED_LIBS)
    MESSAGE(STATUS "  Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
    MESSAGE(STATUS "  Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
-  ENDIF(MINGW AND NOT PCRE2_STATIC)
+  ENDIF(MINGW AND BUILD_SHARED_LIBS)

  IF(MSVC)
    MESSAGE(STATUS "  Install MSVC .pdb files ..........: ${INSTALL_MSVC_PDB}")
--- a/285
+++ b/285
@ -1,5 +1,282 @@
-Change Log for PCRE2
--------------------
+Change Log for PCRE2 - see also the Git log
+-------------------------------------------
+
+
+Version 10.41 xx-xxx-2022
+-------------------------
+
+1. Add fflush() before and after a fork callout in pcre2grep to get its output
+to be the same on all systems. (There were previously ordering differences in
+Alpine Linux).
+
+2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
+
+3. SSF scorecards grumbled about possible overflow in an expression in
+pcre2test. It never would have overflowed in practice, but some casts have been
+added and at the same time there's been some tidying of fprintf() calls that
+output size_t values.
+
+4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
+
+5. Minor code re-arrangement to remove gcc warning about realloc() in
+pcre2test.
+
+6. Change a number of int variables that hold buffer and line lengths in
+pcre2grep to PCRE2_SIZE (aka size_t).
+
+7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
+supported (even though that function would do nothing in that case) at the
+request of a user who doesn't even want to link with pcre2_jit_compile.o. Also
+tidied up an untidy #ifdef arrangement in pcre2test.
+
+8. Fixed an issue in the backtracking optimization of character repeats in
+JIT. Furthermore optimize star repetitions, not just plus repetitions.
+
+9. Removed the use of an initial backtracking frames vector on the system stack 
+in pcre2_match() so that it now always uses the heap. (In a multi-thread 
+environment with very small stacks there had been an issue.) This also is 
+tidier for JIT matching, which didn't need that vector. The heap vector is now 
+remembered in the match data block and re-used if that block itself is re-used. 
+It is freed with the match data block.
+
+10. Adjusted the find_limits code in pcre2test to work with change 9 above.
+
+11. Added find_limits_noheap to pcre2test, because the heap limits are now 
+different in different environments and so cannot be included in the standard 
+tests.
+
+12. Created a test for pcre2_match() heap processing that is not part of the 
+tests run by 'make check', but can be run manually. The current output is from 
+a 64-bit system.
+
+13. Implemented -Z aka --null in pcre2grep.
+
+
+Version 10.40 15-April-2022
+---------------------------
+
+1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
+handling of multiple passes.
+
+2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
+in pcre2grep with buffered fseek(stdin).
+
+3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
+not supported.
+
+4. Revert an unintended change in JIT repeat detection.
+
+5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
+
+6. Merged documentation and comments patches from @carenas (GitHub #47).
+
+7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
+from pcre2grep.
+
+8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
+
+9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
+substituting.
+
+10. Add null_subject and null_replacement modifiers to pcre2test.
+
+11. Add check for NULL subject to POSIX regexec() function.
+
+12. Add check for NULL replacement to pcre2_substitute().
+
+13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
+pcre2_substitute(), and the replacement argument of the latter, if the pointer
+is NULL and the length is zero, treat as an empty string. Apparently a number
+of applications treat NULL/0 in this way.
+
+14. Added support for Bidi_Class and a number of binary Unicode properties,
+including Bidi_Control.
+
+15. Fix some minor issues raised by clang sanitize.
+
+16. Very minor code speed up for maximizing character property matches.
+
+17. A number of changes to script matching for \p and \P:
+
+    (a) Script extensions for a character are now coded as a bitmap instead of
+        a list of script numbers, which should be faster and does not need a
+        loop.
+
+    (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
+        sc and scx).
+
+    (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
+        the same as \p{scx:scriptname} because this change happened in Perl at
+        release 5.26.
+
+    (d) The standard Unicode 4-letter abbreviations for script names are now
+        recognized.
+
+    (e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
+        hyphens, and underscores are ignored in property names, which are then
+        matched independent of case.
+
+18. The Python scripts in the maint directory have been refactored. There are
+now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
+(which is #included by pcre2_tables.c). The data lists that used to be
+duplicated are now held in a single common Python module.
+
+19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
+hardware capabilities, which consist of both an integer address and additional
+metadata, meaning they are twice the size of the platform's size_t type, i.e.
+16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
+8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
+not 16. Whilst the first frame was always suitably aligned, this then
+misaligned the frame that follows, resulting in an alignment fault when storing
+a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
+Clarke PR#72.
+
+20. Added -LP and -LS listing options to pcre2test.
+
+21. A user discovered that the library names in CMakeLists.txt for MSVC
+debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
+
+22. An item such as [Aa] is optimized into a caseless single character match.
+When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
+pattern, the optimizing "must be present for a match" character check was not
+being flagged as caseless, causing some matches that should have succeeded to
+fail.
+
+23. Fixed a Unicode property matching issue in JIT. The character was not
+fully read in caseless matching.
+
+24. Fixed an issue affecting recursions in JIT caused by duplicated data
+transfers.
+
+25. Merged patch from @carenas (GitHub #96) which fixes some problems with
+pcre2test and readline/readedit:
+
+  * Use the right header for libedit in FreeBSD with autoconf
+  * Really allow libedit with cmake
+  * Avoid using readline headers with libedit
+
+
+Version 10.39 29-October-2021
+-----------------------------
+
+1. Fix incorrect detection of alternatives in first character search in JIT.
+
+2. Merged patch from @carenas (GitHub #28):
+
+  Visual Studio 2013 includes support for %zu and %td, so let newer
+  versions of it avoid the fallback, and while at it, make sure that
+  the first check is for DISABLE_PERCENT_ZT so it will be always
+  honoured if chosen.
+
+  ptrdiff_t is signed, so use a signed type instead, and make sure
+  that an appropriate width is chosen if pointers are 64bit wide and
+  long is not (ex: Windows 64bit).
+
+  IMHO removing the cast (and therefore the possibility of truncation)
+  make the code cleaner and the fallback is likely portable enough
+  with all 64-bit POSIX systems doing LP64 except for Windows.
+
+3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0.
+
+4. Merged patch from @carenas (GitHub #30):
+
+  * Cleanup: remove references to no longer used stdint.h
+
+  Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h
+  (simplification) and remove the now unnecessary inclusion in
+  pcre2_internal.h., 2018-11-14), stdint.h is no longer used.
+
+  Remove checks for it in autotools and CMake and document better the expected
+  build failures for systems that might have stdint.h (C99) and not inttypes.h
+  (from POSIX), like old Windows.
+
+  * Cleanup: remove detection for inttypes.h which is a hard dependency
+
+  CMake checks for standard headers are not meant to be used for hard
+  dependencies, so will prevent a possible fallback to work.
+
+  Alternatively, the header could be checked to make the configuration fail
+  instead of breaking the build, but that was punted, as it was missing anyway
+  from autotools.
+
+5. Merged patch from @carenas (GitHub #32):
+
+  * jit: allow building with ancient MSVC versions
+
+  Visual Studio older than 2013 fails to build with JIT enabled, because it is
+  unable to parse non C89 compatible syntax, with mixed declarations and code.
+  While most recent compilers wouldn't even report this as a warning since it
+  is valid C99, it could be also made visible by adding to gcc/clang the
+  -Wdeclaration-after-statement flag at build time.
+
+  Move the code below the affected definitions.
+
+  * pcre2grep: avoid mixing declarations with code
+
+  Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep,
+  2021-08-28), code will fail to build in a strict C89 compiler.
+
+  Reformat slightly to make it C89 compatible again.
+
+
+Version 10.38 01-October-2021
+-----------------------------
+
+1. Fix invalid single character repetition issues in JIT when the repetition
+is inside a capturing bracket and the bracket is preceded by character
+literals.
+
+2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
+This extends the CMake build system to build both static and shared libraries
+in one go, builds the static library with PIC, and exposes PCRE2 libraries
+using the CMake config files. JWB provided these notes:
+
+- Introduced CMake variable BUILD_STATIC_LIBS to build the static library.
+
+- Make a small modification to config-cmake.h.in by removing the PCRE2_STATIC
+  variable. Added PCRE2_STATIC variable to the static build using the
+  target_compile_definitions() function.
+
+- Extended the CMake config files.
+
+  - Introduced CMake variable PCRE2_USE_STATIC_LIBS to easily switch between
+    the static and shared libraries.
+
+  - Added the PCRE2_STATIC variable to the target compile definitions for the
+    import of the static library.
+
+Building static and shared libraries using MSVC results in a name clash of
+the libraries. Both static and shared library builds create, for example, the
+file pcre2-8.lib. Therefore, I decided to change the static library names by
+adding "-static". For example, pcre2-8.lib has become pcre2-8-static.lib.
+[Comment by PH: this is MSVC-specific. It doesn't happen on Linux.]
+
+3. Increased the minimum release number for CMake to 3.0.0 because older than
+2.8.12 is deprecated (it was set to 2.8.5) and causes warnings. Even 3.0.0 is
+quite old; it was released in 2014.
+
+4. Implemented a modified version of Thomas Tempelmann's pcre2grep patch for
+detecting symlink loops. This is dependent on the availability of realpath(),
+which is now tested for in ./configure and CMakeLists.txt.
+
+5. Implemented a modified version of Thomas Tempelmann's patch for faster
+case-independent "first code unit" searches for unanchored patterns in 8-bit
+mode in the interpreters. Instead of just remembering whether one case matched
+or not, it remembers the position of a previous match so as to avoid
+unnecessary repeated searching.
+
+6. Perl now locks out \K in lookarounds, so PCRE2 now does the same by default.
+However, just in case anybody was relying on the old behaviour, there is an
+option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour.
+An option has also been added to pcre2grep to enable this.
+
+7. Re-enable a JIT optimization which was unintentionally disabled in 10.35.
+
+8. There is a loop counter to catch excessively crazy patterns when checking
+the lengths of lookbehinds at compile time. This was incorrectly getting reset
+whenever a lookahead was processed, leading to some fuzzer-generated patterns
+taking a very long time to compile when (?|) was present in the pattern,
+because (?|) disables caching of group lengths.


 Version 10.37 26-May-2021
@ -186,7 +463,7 @@ now correctly backtracked, so this unnecessary restriction has been removed.

 7. Added PCRE2_SUBSTITUTE_MATCHED.

-8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another
+8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
 regex engine. The Perl regex folks are aware of this usage and have made a note
 about it.

@ -617,7 +894,7 @@ Patch by Guillem Jover.
 warnings were reported.

 38. Using the clang compiler with sanitizing options causes runtime complaints
-about truncation for statments such as x = ~x when x is an 8-bit value; it
+about truncation for statements such as x = ~x when x is an 8-bit value; it
 seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
 gets rid of the warnings. There were also two missing casts in pcre2test.

--- a/64
+++ b/64
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
 the pcre2test documentation and the comment at the head of the RunTest file.

 PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
-releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
-confusion with PCRE1.
+releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
+releases started at 10.00 to avoid confusion with PCRE1.


 Historical note 1
@ -38,8 +38,8 @@ Historical note 2
 By contrast, the code originally written by Henry Spencer (which was
 subsequently heavily modified for Perl) compiles the expression twice: once in
 a dummy mode in order to find out how much store will be needed, and then for
-real. (The Perl version probably doesn't do this any more; I'm talking about
-the original library.) The execution function operates by backtracking and
+real. (The Perl version may or may not still do this; I'm talking about the
+original library.) The execution function operates by backtracking and
 maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
 matches individual wild portions of the pattern. This is an "NFA algorithm" in
 Friedl's terminology.
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
 advance to check for such values. When auto-callouts are enabled, the generous
 assumption is made that there will be a callout for each pattern code unit
 (which of course is only actually true if all code units are literals) plus one
-at the end. There is a default parsed pattern vector on the system stack, but
-if this is not big enough, heap memory is used.
+at the end. A default parsed pattern vector is defined on the system stack, to
+minimize memory handling, but if this is not big enough, heap memory is used.

 As before, the actual compiling function is run twice, the first time to
 determine the amount of memory needed for the final compiled pattern. It
@ -187,7 +187,7 @@ META_CLASS_EMPTY      [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
 META_CLASS_EMPTY_NOT  [^] negative empty class - ditto
 META_CLASS_END        ] end of non-empty class
 META_CLASS_NOT        [^ start non-empty negative class
-META_COMMIT           (*COMMIT)
+META_COMMIT           (*COMMIT) - no argument (see below for with argument)
 META_COND_ASSERT      (?(?assertion)
 META_DOLLAR           $ metacharacter
 META_DOT              . metacharacter
@ -201,18 +201,18 @@ META_NOCAPTURE        (?: no capture parens
 META_PLUS             +
 META_PLUS_PLUS        ++
 META_PLUS_QUERY       +?
-META_PRUNE            (*PRUNE) - no argument
+META_PRUNE            (*PRUNE) - no argument (see below for with argument)
 META_QUERY            ?
 META_QUERY_PLUS       ?+
 META_QUERY_QUERY      ??
 META_RANGE_ESCAPED    hyphen in class range with at least one escape
 META_RANGE_LITERAL    hyphen in class range defined literally
-META_SKIP             (*SKIP) - no argument
-META_THEN             (*THEN) - no argument
+META_SKIP             (*SKIP) - no argument (see below for with argument)
+META_THEN             (*THEN) - no argument (see below for with argument)

 The two RANGE values occur only in character classes. They are positioned
 between two literals that define the start and end of the range. In an EBCDIC
-evironment it is necessary to know whether either of the range values was
+environment it is necessary to know whether either of the range values was
 specified as an escape. In an ASCII/Unicode environment the distinction is not
 relevant.

@ -229,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
 is the length of its branch, for which OP_REVERSE must be generated.

 META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
-their data in the lower 16 bits of the element.
+their data in the lower 16 bits of the element. META_RECURSE is followed by an
+offset, for use in error messages.

 META_BACKREF is followed by an offset if the back reference group number is 10
-or more. The offsets of the first ocurrences of references to groups whose
+or more. The offsets of the first occurrences of references to groups whose
 numbers are less than 10 are put in cb->small_ref_offset[] (only the first
 occurrence is useful). On 64-bit systems this avoids using more than two parsed
 pattern elements for items such as \3. The offset is used when an error occurs
 because the reference is to a non-existent group.

-META_RECURSE is always followed by an offset, for use in error messages.
-
 META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
 element contains the 16-bit type and data property values, packed together.
 ESC_g and ESC_k are used only for named references - numerical ones are turned
@ -291,9 +290,9 @@ META_LOOKBEHIND       (?<=      start of lookbehind
 META_LOOKBEHIND_NA    (*naplb:  start of non-atomic lookbehind
 META_LOOKBEHINDNOT    (?<!      start of negative lookbehind

-The following are followed by two elements, the minimum and maximum. Repeat
-values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
-represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
+The following are followed by two elements, the minimum and maximum. The
+maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
+is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:

 META_MINMAX           {n,m}  repeat
 META_MINMAX_PLUS      {n,m}+ repeat
@ -347,11 +346,11 @@ support is not available for this kind of matching.
 Changeable options
 ------------------

-The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
-others) may be changed in the middle of patterns by items such as (?i). Their
-processing is handled entirely at compile time by generating different opcodes
-for the different settings. The runtime functions do not need to keep track of
-an option's state.
+The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
+some others may be changed in the middle of patterns by items such as (?i).
+Their processing is handled entirely at compile time by generating different
+opcodes for the different settings. The runtime functions do not need to keep
+track of an option's state.

 PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
 are tracked and processed during the parsing pre-pass. The others are handled
@ -437,7 +436,7 @@ Backtracking control verbs
 --------------------------

 Verbs with no arguments generate opcodes with no following data (as listed
-in the section above). 
+in the section above).

 (*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
 length in one code unit, and followed by a binary zero. The name length is
@ -468,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
 case-equivalent code points (which is possible only in UTF mode) is handled by
 compiling a Unicode property item (see below), with the pseudo-property
 PT_CLIST. The value of this property is an offset in a vector called
-"ucd_caseless_sets" which identifies the start of a short list of equivalent
-characters, terminated by the value NOTACHAR (0xffffffff).
+"ucd_caseless_sets" which identifies the start of a short list of case
+equivalent characters, terminated by the value NOTACHAR (0xffffffff).


 Repeating single characters
@ -546,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
 and a value. The types are a set of #defines of the form PT_xxx, and the values
 are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
 The value is relevant only for PT_GC (General Category), PT_PC (Particular
-Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
-identify a list of case-equivalent characters when there are three or more.
+Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
+and the pseudo-property PT_CLIST, which is used to identify a list of
+case-equivalent characters when there are three or more (see above).

 Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
 three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
@ -665,9 +665,9 @@ a count that immediately follows the offset.
 There are several opcodes that mark the end of a subpattern group. OP_KET is
 used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
 OP_KETRMAX are used for indefinite repetitions, minimally or maximally
-respectively, and OP_KETRPOS for possessive repetitions (see below for more 
+respectively, and OP_KETRPOS for possessive repetitions (see below for more
 details). All four are followed by a LINK_SIZE value giving (as a positive
-number) the offset back to the matching bracket opcode.
+number) the offset back to the matching opening bracket opcode.

 If a subpattern is quantified such that it is permitted to match zero times, it
 is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
@ -718,7 +718,7 @@ Assertions

 Forward assertions are also just like other subpatterns, but starting with one
 of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
-OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK, 
+OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
 OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
 assertion is OP_REVERSE, followed by a count of the number of characters to
 move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
@ -827,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
 opcode are the correct length, in order to catch updating errors.

 Philip Hazel
-12 July 2019
+April 2022
--- a/8
+++ b/8
@ -23,10 +23,10 @@ Written by:       Philip Hazel
 Email local part: Philip.Hazel
 Email domain:     gmail.com

-University of Cambridge Computing Service,
+Retired from University of Cambridge Computing Service,
 Cambridge, England.

-Copyright (c) 1997-2021 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
 All rights reserved.


@ -37,7 +37,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Email domain:     freemail.hu

-Copyright(c) 2010-2021 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
 All rights reserved.


@ -48,7 +48,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Email domain:     freemail.hu

-Copyright(c) 2009-2021 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
 All rights reserved.


--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -0,0 +1,8 @@
+module(
+    name = "pcre2",
+    version = "10.40",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "rules_cc", version = "0.0.1")
+bazel_dep(name = "bazel_skylib", version = "1.2.1")
--- a/Makefile.am
+++ b/Makefile.am
@ -382,6 +382,10 @@ COMMON_SOURCES = \
  src/pcre2_valid_utf.c \
  src/pcre2_xclass.c

+# The pcre2_ucptables.c file is #included by pcre2_tables.c
+
+EXTRA_DIST += src/pcre2_ucptables.c
+
 if WITH_PCRE2_8
 lib_LTLIBRARIES += libpcre2-8.la
 libpcre2_8_la_SOURCES = \
@ -448,9 +452,10 @@ EXTRA_DIST += \
  src/sljit/sljitNativePPC_32.c \
  src/sljit/sljitNativePPC_64.c \
  src/sljit/sljitNativePPC_common.c \
+  src/sljit/sljitNativeRISCV_32.c \
+  src/sljit/sljitNativeRISCV_64.c \
+  src/sljit/sljitNativeRISCV_common.c \
  src/sljit/sljitNativeS390X.c \
-  src/sljit/sljitNativeSPARC_32.c \
-  src/sljit/sljitNativeSPARC_common.c \
  src/sljit/sljitNativeX86_32.c \
  src/sljit/sljitNativeX86_64.c \
  src/sljit/sljitNativeX86_common.c \
@ -663,6 +668,7 @@ EXTRA_DIST += \
  testdata/testinput23 \
  testdata/testinput24 \
  testdata/testinput25 \
+  testdata/testinput26 \
  testdata/testinputEBC \
  testdata/testoutput1 \
  testdata/testoutput2 \
@ -705,6 +711,7 @@ EXTRA_DIST += \
  testdata/testoutput23 \
  testdata/testoutput24 \
  testdata/testoutput25 \
+  testdata/testoutput26 \
  testdata/testoutputEBC \
  testdata/valgrind-jit.supp \
  testdata/wintestinput3 \
@ -859,9 +866,11 @@ endif # WITH_GCOV

 EXTRA_DIST += \
  cmake/COPYING-CMAKE-SCRIPTS \
+  cmake/FindEditline.cmake \
  cmake/FindPackageHandleStandardArgs.cmake \
  cmake/FindReadline.cmake \
-  cmake/FindEditline.cmake \
+  cmake/pcre2-config-version.cmake.in \
+  cmake/pcre2-config.cmake.in \
  CMakeLists.txt \
  config-cmake.h.in

--- a/Makefile.os4
+++ b/Makefile.os4
@ -0,0 +1,271 @@
+#
+# Project: pcre2
+#
+# Created on: 10-01-2022 22:01:46
+#
+# commands to use:
+# make -f Makefile.os4 libpcre2.a
+# make -f Makefile.os4 libpcre2-posix.a
+# make -f Makefile.os4 pcre2test
+# sh RunTest
+# make -f Makefile.os4 clean
+#
+
+###################################################################
+##
+##////  Objects
+##
+###################################################################
+
+# Objects archived into libpcre2.a.
+libpcre2_OBJ := \
+	 src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
+	 src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
+	 src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
+	 src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
+	 src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
+	 src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
+	 src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
+	 src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
+	 src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
+	
+# Object archived into libpcre2-posix.a.
+pcre2posix_OBJ := \
+	 src/pcre2posix.o
+
+# Object linked into the pcre2test program.
+pcre2test_OBJ := \
+	 src/pcre2test.o
+
+# Object linked into the pcre2grep program.
+pcre2grep_OBJ := \
+	 src/pcre2grep.o
+
+###################################################################
+##
+##////  Variables and Environment
+##
+###################################################################
+
+# C runtime flag: newlib unless the caller passes USE_CLIB2=yes.
+MCRT := -mcrt=newlib
+ifeq ($(USE_CLIB2), yes)
+MCRT := -mcrt=clib2
+endif
+
+CC := gcc:bin/gcc
+INCPATH := -I. -Isrc
+
+# Flags the .c.o rule passes for every object; selects the 8-bit code unit width.
+CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
+
+###################################################################
+##
+##////  General rules
+##
+###################################################################
+
+.PHONY: all all-before all-after tests clean clean-custom cleanall realclean runtests release
+
+all: all-before libpcre2.a libpcre2-posix.a all-after
+
+all-before:
+#	You can add rules here to execute before the project is built
+
+all-after:
+#	You can add rules here to execute after the project is built
+
+tests: pcre2test pcre2grep
+# Remove every object file this makefile compiles, including the pcre2grep one.
+clean: clean-custom
+	@echo "Cleaning compiler objects..."
+	@rm -f  $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ) $(pcre2grep_OBJ)
+# Remove the objects plus the libraries and programs that the targets below create.
+cleanall: clean
+	@echo "Cleaning compiler targets..."
+	@rm -f  libpcre2.a libpcre2-posix.a pcre2test pcre2grep
+
+###################################################################
+##
+##////  Targets
+##
+###################################################################
+
+libpcre2.a: $(libpcre2_OBJ)
+	ar -rcs libpcre2.a $(libpcre2_OBJ)
+	ranlib libpcre2.a
+
+libpcre2-posix.a: $(pcre2posix_OBJ)
+	ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
+	ranlib libpcre2-posix.a
+# Static archives resolve left to right: the POSIX wrapper must precede libpcre2.
+pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
+	@echo "Linking pcre2test"
+	@$(CC) $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2-posix -lpcre2
+	@echo "Removing stale debug target: pcre2test"
+	@rm -f pcre2test.debug
+	
+pcre2grep: libpcre2.a $(pcre2grep_OBJ)
+	@echo "Linking pcre2grep"
+	@$(CC) $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L. -lauto -lpcre2
+	@echo "Removing stale debug target: pcre2grep"
+	@rm -f pcre2grep.debug
+
+
+###################################################################
+##
+##////  Standard rules
+##
+###################################################################
+
+# Generic compile rule shared by every object listed below. The compiler
+# command line is hidden; only a short "Compiling ..." notice is printed.
+
+.c.o:
+	@echo "Compiling $<"
+	@$(CC) $(CFLAGS) -c $< -o $@
+
+# Header and #included-source prerequisites; the recipe comes from the .c.o rule.
+src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	
+
+src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	 src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
+	 src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
+	 src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
+	 src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
+
+src/pcre2_maketables.o: src/pcre2_maketables.c src/config.h src/pcre2_internal.h src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
+	 src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
+	 src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
+	 src/pcre2_ucd.c src/pcre2_printint.c
+
+src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
+	 src/pcre2posix.h
+
+src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	 src/pcre2_ucptables.c
+
+src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2grep.o: src/pcre2grep.c src/config.h src/pcre2.h
+
+###################################################################
+##
+##////  Custom rules
+##
+###################################################################
+
+runtests: libpcre2.a libpcre2-posix.a tests
+	sh RunTest
+	sh RunGrepTest
+# Package the newlib and clib2 builds of both libraries, plus headers and docs, into pcre2.lha.
+release:
+	@echo "Create release folders..."
+	@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
+# Start from a clean tree so objects compiled for another C runtime are never reused.
+	@$(MAKE) -f Makefile.os4 cleanall
+	@echo "Building newlib based libraries..."
+	@$(MAKE) -f Makefile.os4 all USE_CLIB2=no
+	@cp libpcre2.a release/local/newlib/lib/
+	@cp libpcre2-posix.a release/local/newlib/lib/
+	
+	@echo "Clean build and libraries files..."
+	@$(MAKE) -f Makefile.os4 cleanall
+	
+	@echo "Building clib2 based libraries..."
+	@$(MAKE) -f Makefile.os4 all USE_CLIB2=yes
+	@cp libpcre2.a release/local/clib2/lib/
+	@cp libpcre2-posix.a release/local/clib2/lib/
+
+	@echo "Copy the necessary files..."
+	@cp src/pcre2.h release/local/common/include/
+	@cp src/pcre2posix.h release/local/common/include/
+	@cp COPYING release/local/Documentation/pcre2/
+	@cp HACKING release/local/Documentation/pcre2/
+	@cp LICENCE release/local/Documentation/pcre2/
+	@cp README release/local/Documentation/pcre2/
+	@cp README-OS4.md release/local/Documentation/pcre2/
+	
+	@echo "Clean build and libraries files..."
+	@$(MAKE) -f Makefile.os4 cleanall
+	
+	@echo "Creating the lha release file..."
+	@rm -f pcre2.lha
+	@lha -aeqr3 a pcre2.lha release/
+	@rm -rf release
+
+###################################################################
+
--- a/58
+++ b/58
@ -2,6 +2,64 @@ News about PCRE2 releases
 -------------------------


+Version 10.40 15-April-2022
+---------------------------
+
+This is mostly a bug-fixing and code-tidying release. However, there are some
+extensions to Unicode property handling:
+
+* Added support for Bidi_Class and a number of binary Unicode properties,
+including Bidi_Control.
+
+* A number of changes to script matching for \p and \P:
+
+  (a) Script extensions for a character are now coded as a bitmap instead of
+      a list of script numbers, which should be faster and does not need a
+      loop.
+
+  (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
+      sc and scx).
+
+  (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
+      the same as \p{scx:scriptname} because this change happened in Perl at
+      release 5.26.
+
+  (d) The standard Unicode 4-letter abbreviations for script names are now
+      recognized.
+
+  (e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
+      hyphens, and underscores are ignored in property names, which are then
+      matched independent of case.
+
+As always, see ChangeLog for a list of all changes (also the Git log).
+
+
+Version 10.39 29-October-2021
+-----------------------------
+
+This release is happening soon after 10.38 because the bug fix is important.
+
+1. Fix incorrect detection of alternatives in first character search in JIT.
+
+2. Update to Unicode 14.0.0.
+
+3. Some code cleanups (see ChangeLog).
+
+
+Version 10.38 01-October-2021
+-----------------------------
+
+As well as some bug fixes and tidies (as always, see ChangeLog for details),
+the documentation is updated to list the new URLs, following the move of the
+source repository to GitHub and the mailing list to Google Groups.
+
+* The CMake build system can now build both static and shared libraries in one
+go.
+
+* Following Perl's lead, \K is now locked out in lookaround assertions by
+default, but an option is provided to re-enable the previous behaviour.
+
+
 Version 10.37 26-May-2021
 -------------------------

--- a/13
+++ b/13
@ -121,6 +121,7 @@ environment, for example.
       pcre2_substring.c
       pcre2_tables.c
       pcre2_ucd.c
+       pcre2_ucptables.c
       pcre2_valid_utf.c
       pcre2_xclass.c

@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
 3.  Create a new, empty build directory, preferably a subdirectory of the
    source dir. For example, C:\pcre2\pcre2-xx\build.

-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
    Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
    to start Cmake from the Windows Start menu, as this can lead to errors.

@ -343,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".

 BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO

-The code currently cannot be compiled without a stdint.h header, which is
-available only in relatively recent versions of Visual Studio. However, this
-portable and permissively-licensed implementation of the header worked without
-issue:
+The code currently cannot be compiled without an inttypes.h header, which is
+available only with Visual Studio 2013 or newer. However, this portable and
+permissively-licensed implementation of the stdint.h header could be used as an
+alternative:

  http://www.azillionmonkeys.com/qed/pstdint.h

@ -373,7 +374,7 @@ Otherwise:
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
   have been created.

-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
   the pcre2 source (wherein which the testdata folder resides), e.g.:

   set srcdir=C:\pcre2\pcre2-10.00
--- a/70
+++ b/70
@ -5,18 +5,19 @@ PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
 API. Since its initial release in 2015, there has been further development of
 the code and it now differs from PCRE1 in more than just the API. There are new
 features, and the internals have been improved. The original PCRE1 library is
-now obsolete and should not be used in new projects. The latest release of
-PCRE2 is available in three alternative formats from:
+now obsolete and no longer maintained. The latest release of PCRE2 is available
+in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:

-https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz
-https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2
-https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip
+https://github.com/PCRE2Project/pcre2/releases

-There is a mailing list for discussion about the development of PCRE at
-pcre-dev@exim.org. You can access the archives and subscribe or manage your
-subscription here:
+There is a mailing list for discussion about the development of PCRE2 at
+pcre2-dev@googlegroups.com. You can subscribe by sending an email to
+pcre2-dev+subscribe@googlegroups.com.

-   https://lists.exim.org/mailman/listinfo/pcre-dev
+You can access the archives and also subscribe or manage your subscription
+here:
+
+https://groups.google.com/g/pcre2-dev

 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@ -113,12 +114,18 @@ Building PCRE2 using autotools
 The following instructions assume the use of the widely used "configure; make;
 make install" (autotools) process.

-To build PCRE2 on system that supports autotools, first run the "configure"
-command from the PCRE2 distribution directory, with your current directory set
+If you have downloaded and unpacked a PCRE2 release tarball, run the
+"configure" command from the PCRE2 directory, with your current directory set
 to the directory where you want the files to be created. This command is a
 standard GNU "autoconf" configuration script, for which generic instructions
 are supplied in the file INSTALL.

+The files in the GitHub repository do not contain "configure". If you have
+downloaded the PCRE2 source files from GitHub, before you can run "configure"
+you must run the shell script called autogen.sh. This runs a number of
+autotools to create a "configure" script (you must of course have the autotools
+commands installed in order to do this).
+
 Most commonly, people build PCRE2 within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient. However,
 the usual methods of changing standard defaults are available. For example:
@ -187,10 +194,10 @@ library. They are also documented in the pcre2build man page.

  As well as supporting UTF strings, Unicode support includes support for the
  \P, \p, and \X sequences that recognize Unicode character properties.
-  However, only the basic two-letter properties such as Lu are supported.
-  Escape sequences such as \d and \w in patterns do not by default make use of
-  Unicode properties, but can be made to do so by setting the PCRE2_UCP option
-  or starting a pattern with (*UCP).
+  However, only a subset of Unicode properties are supported; see the
+  pcre2pattern man page for details. Escape sequences such as \d and \w in
+  patterns do not by default make use of Unicode properties, but can be made to
+  do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).

 . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
  of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
@ -368,19 +375,20 @@ library. They are also documented in the pcre2build man page.
  necessary to specify something like LIBS="-lncurses" as well. This is
  because, to quote the readline INSTALL, "Readline uses the termcap functions,
  but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
+  applications which link with readline the option to choose an appropriate
+  library."
  If you get error messages about missing functions tgetstr, tgetent, tputs,
  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
  should fix it.

 . The C99 standard defines formatting modifiers z and t for size_t and
  ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-  environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-  defined and has a value greater than or equal to 199901L (indicating C99).
-  However, there is at least one environment that claims to be C99 but does not
-  support these modifiers. If --disable-percent-zt is specified, no use is made
-  of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
-  size_t values.
+  environments other than Microsoft Visual Studio versions earlier than 2013
+  when __STDC_VERSION__ is defined and has a value greater than or equal to
+  199901L (indicating C99). However, there is at least one environment that
+  claims to be C99 but does not support these modifiers. If
+  --disable-percent-zt is specified, no use is made of the z or t modifiers.
+  Instead of %td or %zu, %lu is used, with a cast for size_t values.

 . There is a special option called --enable-fuzz-support for use by people who
  want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
@ -393,10 +401,10 @@ library. They are also documented in the pcre2build man page.
  Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
  be created. This is normally run under valgrind or used when PCRE2 is
  compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.

 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
  which caused pcre2_match() to use individual blocks on the heap for
@ -410,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
 . Makefile             the makefile that builds the library
 . src/config.h         build-time configuration options for the library
 . src/pcre2.h          the public PCRE2 header file
-. pcre2-config          script that shows the building settings such as CFLAGS
+. pcre2-config         script that shows the building settings such as CFLAGS
                         that were set for "configure"
 . libpcre2-8.pc        )
 . libpcre2-16.pc       ) data for the pkg-config command
@ -601,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.

 Many (but not all) of the tests that are not skipped are run twice if JIT
 support is available. On the second run, JIT compilation is forced. This
-testing can be suppressed by putting "nojit" on the RunTest command line.
+testing can be suppressed by putting "-nojit" on the RunTest command line.

 The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
 libraries that are enabled. If you want to run just one set of tests, call
 RunTest with either the -8, -16 or -32 option.

-If valgrind is installed, you can run the tests under it by putting "valgrind"
+If valgrind is installed, you can run the tests under it by putting "-valgrind"
 on the RunTest command line. To run pcre2test on just one or more specific test
 files, give their numbers as arguments to RunTest, for example:

@ -688,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
 different code unit widths.

 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.

 Test 16 is run only when JIT support is not available. It checks that an
@ -904,4 +912,4 @@ The distribution should contain the files listed below.
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 28 April 2021
+Last updated: 15 April 2022
--- a/README-OS4.md
+++ b/README-OS4.md
@ -0,0 +1,39 @@
+PCRE2 (Perl-compatible regular expression library)
+---------------------------------------------------------------------------
+
+This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
+GitHub repository https://github.com/PCRE2Project/pcre2
+
+More information about PCRE2 can be found on its official website,
+https://www.pcre.org, and in the documentation that comes with this
+package.
+
+In the archive both newlib and clib2 libraries are included. It has been
+tested with various applications, but in case you find issues please 
+contact me.
+
+To install it into your AmigaOS 4 SDK installation, just extract all the 
+files in the SDK: path.
+
+Compile
+--------------------------
+The source and the changes I made can be found in my personal repository
+https://git.walkero.gr/walkero/pcre2
+
+You can compile it using the Makefile.os4 file, and produce the libraries
+yourself.
+
+* with newlib run:
+  ```bash
+  make -f Makefile.os4 all
+  ```
+* with clib2 run:
+  ```bash
+  make -f Makefile.os4 all USE_CLIB2=yes
+  ```
+
+Changelog
+--------------------------
+v10.40r1 - 2022-07-31
+* First release
+
--- a/README.md
+++ b/README.md
@ -0,0 +1,56 @@
+# PCRE2 - Perl-Compatible Regular Expressions
+
+The PCRE2 library is a set of C functions that implement regular expression
+pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
+own native API, as well as a set of wrapper functions that correspond to the
+POSIX regular expression API. The PCRE2 library is free, even for building 
+proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
+or 32-bit code units, in either literal or UTF encoding.
+
+PCRE2 was first released in 2015 to replace the API in the original PCRE 
+library, which is now obsolete and no longer maintained. As well as a more
+flexible API, the code of PCRE2 has been much improved since the fork.
+ 
+## Download
+
+As well as downloading from the 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
+or the older, unmaintained PCRE1 library from an 
+[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
+
+You can check out the PCRE2 source code via Git or Subversion:
+
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git
+
+## Contributed Ports
+
+If you just need the command-line PCRE2 tools on Windows, precompiled binary
+versions are available at this 
+[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
+
+A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
+default character encoding, can be found at 
+[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
+
+## Documentation
+
+You can read the PCRE2 documentation 
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
+
+Comparisons to Perl's regular expression semantics can be found in the
+community authored Wikipedia entry for PCRE.
+
+There is a curated summary of changes for each PCRE release, copies of
+documentation from older releases, and other useful information from the third
+party authored 
+[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
+
+## Contact
+
+To report a problem with the PCRE2 library, or to make a feature request, please
+use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
+PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
+announcements will be made. You can browse the 
+[list archives](https://groups.google.com/g/pcre2-dev).
+
--- a/60
+++ b/60
@ -68,6 +68,22 @@ diff -b  /dev/null /dev/null 2>/dev/null && cf="diff -b"
 diff -u  /dev/null /dev/null 2>/dev/null && cf="diff -u"
 diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"

+# Some tests involve NUL characters. It seems impossible to handle them easily
+# in many operating systems. An earlier version of this script used sed to
+# translate NUL into the string ZERO, but this didn't work on Solaris (aka
+# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
+# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
+# even when using GNU sed. A user suggested using tr instead, which
+# necessitates translating to a single character. However, on (some versions
+# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
+# /usr/xpg4/bin/tr is available, it can do so, so test for that.
+
+if [ -x /usr/xpg4/bin/tr ] ; then
+  tr=/usr/xpg4/bin/tr
+else
+  tr=tr
+fi
+
 # If this test is being run from "make check", $srcdir will be set. If not, set
 # it to the current or parent directory, whichever one contains the test data.
 # Subsequently, we run most of the pcre2grep tests in the source directory so
@ -558,7 +574,7 @@ echo "RC=$?" >>testtrygrep
 echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
 echo "a" >testtemp1grep
 echo "aaaaa" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep  --line-offsets '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep  --line-offsets --allow-lookaround-bsk '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 108 ------------------------------" >>testtrygrep
@ -638,13 +654,13 @@ echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
 printf 'abcd\n' >testNinputgrep
-$valgrind $vjs $pcre2grep --colour=always '(?<=\K.)' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K.)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
-$valgrind $vjs $pcre2grep --colour=always '(?=.\K)' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=.\K)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
-$valgrind $vjs $pcre2grep --colour=always '(?<=\K[ac])' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K[ac])' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
-$valgrind $vjs $pcre2grep --colour=always '(?=[ac]\K)' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
@ -674,13 +690,27 @@ echo "---------------------------- Test 131 -----------------------------" >>tes
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <$srcdir/testdata/grepinput >>testtrygrep 2>&1
+(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
+(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
 (cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep

+echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
 # Now compare the results.

 $cf $srcdir/testdata/grepoutput testtrygrep
@ -701,7 +731,7 @@ if [ $utf8 -ne 0 ] ; then
  echo "RC=$?" >>testtrygrep

  echo "---------------------------- Test U3 ------------------------------" >>testtrygrep
-  (cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
+  (cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any --allow-lookaround-bsk '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
  echo "RC=$?" >>testtrygrep

  echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
@ -755,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
 printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
 $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep

-# This next test involves NUL characters. It seems impossible to handle them
-# easily in many operating systems. An earlier version of this script used sed
-# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
-# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
-# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
-# even when using GNU sed. A user suggested using tr instead, which
-# necessitates translating to a single character (@). However, on (some
-# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
-# /usr/xpg4/bin/tr is available, it can do so, so test for that.
-
-if [ -x /usr/xpg4/bin/tr ] ; then
-  tr=/usr/xpg4/bin/tr
-else
-  tr=tr
-fi
-
 printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
 printf 'abc\0def' >testNinputgrep
 $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
--- a/63
+++ b/63
@ -17,8 +17,16 @@
 # individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
 # end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
 # runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
-# except test 10. Whatever order the arguments are in, the tests are always run
-# in numerical order.
+# except test 10. Whatever order the arguments are in, these tests are always
+# run in numerical order.
+#
+# If no specific tests are selected (which is the case when this script is run
+# via 'make check') the default is to run all the numbered tests.
+#
+# There may also be named (as well as numbered) tests for special purposes. At
+# present there is just one, called "heap". This test's output contains the
+# sizes of heap frames and frame vectors, which depend on the environment. It
+# is therefore not run unless explicitly requested.
 #
 # Inappropriate tests are automatically skipped (with a comment to say so). For
 # example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
 title23="Test 23: \C disabled test"
 title24="Test 24: Non-UTF pattern conversion tests"
 title25="Test 25: UTF pattern conversion tests"
-maxtest=25
+title26="Test 26: Auto-generated unicode property tests"
+maxtest=26
+titleheap="Test 'heap': Environment-specific heap tests"

 if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title0
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title23
  echo $title24
  echo $title25
+  echo $title26
+  echo ""
+  echo $titleheap
+  echo ""
+  echo "Numbered tests are automatically run if nothing selected."
+  echo "Named tests must be explicitly selected."
  exit 0
 fi

@ -238,6 +254,8 @@ do22=no
 do23=no
 do24=no
 do25=no
+do26=no
+doheap=no

 while [ $# -gt 0 ] ; do
  case $1 in
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
   23) do23=yes;;
   24) do24=yes;;
   25) do25=yes;;
+   26) do26=yes;;
+ heap) doheap=yes;;
   -8) arg8=yes;;
  -16) arg16=yes;;
  -32) arg32=yes;;
@ -320,7 +340,8 @@ fi
 # set up a large stack.

 $sim ./pcre2test -S 64 /dev/null /dev/null
-if [ $? -eq 0 -a "$bigstack" != "" ] ; then
+support_setstack=$?
+if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
  setstack="-S 64"
 else
  setstack=""
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
  fi
 fi

-# If no specific tests were requested, select all. Those that are not
-# relevant will be automatically skipped.
+# If no specific tests were requested, select all the numbered tests. Those
+# that are not relevant will be automatically skipped.

 if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do4  = no -a $do5  = no -a $do6  = no -a $do7  = no -a \
@ -416,7 +437,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
     $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
     $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
-     $do24 = no -a $do25 = no \
+     $do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
   ]; then
  do0=yes
  do1=yes
@ -444,6 +465,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
  do23=yes
  do24=yes
  do25=yes
+  do26=yes
 fi

 # Handle any explicit skips at this stage, so that an argument list may consist
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
    echo '' >testtry
    checkspecial '-C'
    checkspecial '--help'
-    checkspecial '-S 1 -t 10 testSinput'
+    if [ $support_setstack -eq 0 ] ; then
+      checkspecial '-S 1 -t 10 testSinput'
+    fi
    echo "  OK"
  fi

@ -860,6 +884,29 @@ for bmode in "$test8" "$test16" "$test32"; do
    fi
  fi

+  # Auto-generated unicode property tests
+
+  if [ $do26 = yes ] ; then
+    echo $title26
+    if [ $utf -eq 0 ] ; then
+      echo "  Skipped because UTF-$bits support is not available"
+    else
+      for opt in "" $jitopt; do
+        $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
+        checkresult $? 26 "$opt"
+      done
+    fi
+  fi
+
+  # Manually selected heap tests - output may vary in different environments,
+  # which is why they are not automatically run.
+
+  if [ $doheap = yes ] ; then
+    echo $titleheap
+    $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
+    checkresult $? heap-$bits ""
+  fi
+
 # End of loop for 8/16/32-bit tests
 done

--- a/RunTest.bat
+++ b/RunTest.bat
@ -135,9 +135,9 @@ if "%all%" == "yes" (
  set do7=yes
  set do8=yes
  set do9=yes
-  set do10=yes
+  set do10=no
  set do11=yes
-  set do12=yes
+  set do12=no
  set do13=yes
  set do14=yes
  set do15=yes
--- a/WORKSPACE.bazel
+++ b/WORKSPACE.bazel
@ -0,0 +1 @@
+# See MODULE.bazel
--- a/cmake/FindEditline.cmake
+++ b/cmake/FindEditline.cmake
@ -1,17 +1,16 @@
 # Modified from FindReadline.cmake (PH Feb 2012)

-if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
+if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
  set(EDITLINE_FOUND TRUE)
-else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
-  FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
-    /usr/include/editline
-    /usr/include/edit/readline  
-    /usr/include/readline
+else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
+  FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
+    editline
+    edit/readline
  )
  
  FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
  include(FindPackageHandleStandardArgs)
-  FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
+  FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)

  MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
-endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
+endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
--- a/cmake/pcre2-config-version.cmake.in
+++ b/cmake/pcre2-config-version.cmake.in
@ -0,0 +1,15 @@
+set(PACKAGE_VERSION_MAJOR @PCRE2_MAJOR@)
+set(PACKAGE_VERSION_MINOR @PCRE2_MINOR@)
+set(PACKAGE_VERSION_PATCH 0)
+set(PACKAGE_VERSION @PCRE2_MAJOR@.@PCRE2_MINOR@.0)
+
+# Check whether the requested PACKAGE_FIND_VERSION is compatible
+if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION OR
+   PACKAGE_VERSION_MAJOR GREATER PACKAGE_FIND_VERSION_MAJOR)
+  set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  if(PACKAGE_VERSION VERSION_EQUAL PACKAGE_FIND_VERSION)
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
--- a/cmake/pcre2-config.cmake.in
+++ b/cmake/pcre2-config.cmake.in
@ -0,0 +1,145 @@
+# pcre2-config.cmake
+# ----------------
+#
+# Finds the PCRE2 library. Specify the starting search path in PCRE2_ROOT.
+#
+# Static vs. shared
+# -----------------
+# To make use of the static library instead of the shared one, one needs
+# to set the variable PCRE2_USE_STATIC_LIBS to ON before calling find_package.
+# Example:
+#   set(PCRE2_USE_STATIC_LIBS ON)
+#   find_package(PCRE2 CONFIG COMPONENTS 8BIT)
+#
+# This will define the following variables:
+#
+#   PCRE2_FOUND   - True if the system has the PCRE2 library.
+#   PCRE2_VERSION - The version of the PCRE2 library which was found.
+#
+# and the following imported targets:
+#
+#   PCRE2::8BIT  - The 8 bit PCRE2 library.
+#   PCRE2::16BIT - The 16 bit PCRE2 library.
+#   PCRE2::32BIT - The 32 bit PCRE2 library.
+#   PCRE2::POSIX - The POSIX PCRE2 library.
+
+set(PCRE2_NON_STANDARD_LIB_PREFIX @NON_STANDARD_LIB_PREFIX@)
+set(PCRE2_NON_STANDARD_LIB_SUFFIX @NON_STANDARD_LIB_SUFFIX@)
+set(PCRE2_8BIT_NAME pcre2-8)
+set(PCRE2_16BIT_NAME pcre2-16)
+set(PCRE2_32BIT_NAME pcre2-32)
+set(PCRE2_POSIX_NAME pcre2-posix)
+find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h DOC "PCRE2 include directory")
+if (PCRE2_USE_STATIC_LIBS)
+  if (MSVC)
+    set(PCRE2_8BIT_NAME pcre2-8-static)
+    set(PCRE2_16BIT_NAME pcre2-16-static)
+    set(PCRE2_32BIT_NAME pcre2-32-static)
+    set(PCRE2_POSIX_NAME pcre2-posix-static)
+  endif ()
+
+  set(PCRE2_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
+  set(PCRE2_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+else ()
+  set(PCRE2_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
+  if (MINGW AND PCRE2_NON_STANDARD_LIB_PREFIX)
+    set(PCRE2_PREFIX "")
+  endif ()
+
+  set(PCRE2_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if (MINGW AND PCRE2_NON_STANDARD_LIB_SUFFIX)
+    set(PCRE2_SUFFIX "-0.dll")
+  endif ()
+endif ()
+find_library(PCRE2_8BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit PCRE2 library")
+find_library(PCRE2_16BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}d${PCRE2_SUFFIX} DOC "16 bit PCRE2 library")
+find_library(PCRE2_32BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}d${PCRE2_SUFFIX} DOC "32 bit PCRE2 library")
+find_library(PCRE2_POSIX_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}d${PCRE2_SUFFIX} DOC "8 bit POSIX PCRE2 library")
+unset(PCRE2_NON_STANDARD_LIB_PREFIX)
+unset(PCRE2_NON_STANDARD_LIB_SUFFIX)
+unset(PCRE2_8BIT_NAME)
+unset(PCRE2_16BIT_NAME)
+unset(PCRE2_32BIT_NAME)
+unset(PCRE2_POSIX_NAME)
+
+# Set version
+if (PCRE2_INCLUDE_DIR)
+  set(PCRE2_VERSION "@PCRE2_MAJOR@.@PCRE2_MINOR@.0")
+endif ()
+
+# Which components have been found.
+if (PCRE2_8BIT_LIBRARY)
+  set(PCRE2_8BIT_FOUND TRUE)
+endif ()
+if (PCRE2_16BIT_LIBRARY)
+  set(PCRE2_16BIT_FOUND TRUE)
+endif ()
+if (PCRE2_32BIT_LIBRARY)
+  set(PCRE2_32BIT_FOUND TRUE)
+endif ()
+if (PCRE2_POSIX_LIBRARY)
+  set(PCRE2_POSIX_FOUND TRUE)
+endif ()
+
+# Check if at least one component has been specified.
+list(LENGTH PCRE2_FIND_COMPONENTS PCRE2_NCOMPONENTS)
+if (PCRE2_NCOMPONENTS LESS 1)
+  message(FATAL_ERROR "No components have been specified. This is not allowed. Please, specify at least one component.")
+endif ()
+unset(PCRE2_NCOMPONENTS)
+
+# When POSIX component has been specified make sure that also 8BIT component is specified.
+set(PCRE2_8BIT_COMPONENT FALSE)
+set(PCRE2_POSIX_COMPONENT FALSE)
+foreach(component ${PCRE2_FIND_COMPONENTS})
+  if (component STREQUAL "8BIT")
+    set(PCRE2_8BIT_COMPONENT TRUE)
+  elseif (component STREQUAL "POSIX")
+    set(PCRE2_POSIX_COMPONENT TRUE)
+  endif ()
+endforeach()
+
+if (PCRE2_POSIX_COMPONENT AND NOT PCRE2_8BIT_COMPONENT)
+  message(FATAL_ERROR "The component POSIX is specified while the 8BIT one is not. This is not allowed. Please, also specify the 8BIT component.")
+endif()
+unset(PCRE2_8BIT_COMPONENT)
+unset(PCRE2_POSIX_COMPONENT)
+
+include(FindPackageHandleStandardArgs)
+set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
+find_package_handle_standard_args(PCRE2
+  FOUND_VAR PCRE2_FOUND
+  REQUIRED_VARS PCRE2_INCLUDE_DIR
+  HANDLE_COMPONENTS
+  VERSION_VAR PCRE2_VERSION
+  CONFIG_MODE
+)
+
+set(PCRE2_LIBRARIES)
+if (PCRE2_FOUND)
+  foreach(component ${PCRE2_FIND_COMPONENTS})
+    if (PCRE2_USE_STATIC_LIBS)
+      add_library(PCRE2::${component} STATIC IMPORTED)
+      target_compile_definitions(PCRE2::${component} INTERFACE PCRE2_STATIC)
+    else ()
+      add_library(PCRE2::${component} SHARED IMPORTED)
+    endif ()
+    set_target_properties(PCRE2::${component} PROPERTIES
+      IMPORTED_LOCATION "${PCRE2_${component}_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_INCLUDE_DIR}"
+    )
+    if (component STREQUAL "POSIX")
+      set_target_properties(PCRE2::${component} PROPERTIES
+        INTERFACE_LINK_LIBRARIES "PCRE2::8BIT"
+        LINK_LIBRARIES "PCRE2::8BIT"
+      )
+    endif ()
+
+    set(PCRE2_LIBRARIES ${PCRE2_LIBRARIES} ${PCRE2_${component}_LIBRARY})
+    mark_as_advanced(PCRE2_${component}_LIBRARY)
+  endforeach()
+endif ()
+
+mark_as_advanced(
+  PCRE2_INCLUDE_DIR
+)
--- a/config-cmake.h.in
+++ b/config-cmake.h.in
@ -2,8 +2,6 @@

 #cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
 #cmakedefine HAVE_DIRENT_H 1
-#cmakedefine HAVE_INTTYPES_H 1    
-#cmakedefine HAVE_STDINT_H 1                                                   
 #cmakedefine HAVE_STRERROR 1
 #cmakedefine HAVE_SYS_STAT_H 1
 #cmakedefine HAVE_SYS_TYPES_H 1
@ -16,8 +14,6 @@
 #cmakedefine HAVE_SECURE_GETENV 1
 #cmakedefine HAVE_STRERROR 1

-#cmakedefine PCRE2_STATIC 1
-
 #cmakedefine SUPPORT_PCRE2_8 1
 #cmakedefine SUPPORT_PCRE2_16 1
 #cmakedefine SUPPORT_PCRE2_32 1
--- a/configure.ac
+++ b/configure.ac
@ -9,15 +9,15 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
 dnl be defined as -RC2, for example. For real releases, it should be empty.

 m4_define(pcre2_major, [10])
-m4_define(pcre2_minor, [37])
+m4_define(pcre2_minor, [41])
 m4_define(pcre2_prerelease, [])
-m4_define(pcre2_date, [2021-05-26])
+m4_define(pcre2_date, [2022-xx-xx])

 # Libtool shared library interface versions (current:revision:age)
-m4_define(libpcre2_8_version,     [10:2:10])
-m4_define(libpcre2_16_version,    [10:2:10])
-m4_define(libpcre2_32_version,    [10:2:10])
-m4_define(libpcre2_posix_version, [3:0:0])
+m4_define(libpcre2_8_version,     [11:0:11])
+m4_define(libpcre2_16_version,    [11:0:11])
+m4_define(libpcre2_32_version,    [11:0:11])
+m4_define(libpcre2_posix_version, [3:2:0])

 # NOTE: The CMakeLists.txt file searches for the above variables in the first
 # 50 lines of this file. Please update that if the variables above are moved.
@ -513,6 +513,19 @@ AC_TYPE_SIZE_T
 # Checks for library functions.

 AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
+AC_MSG_CHECKING([for realpath])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+#include <stdlib.h>
+#include <limits.h>
+]],[[
+char buffer[PATH_MAX];
+realpath(".", buffer);
+]])],
+[AC_MSG_RESULT([yes])
+ AC_DEFINE([HAVE_REALPATH], 1,
+  [Define to 1 if you have the `realpath' function.])
+],
+AC_MSG_RESULT([no]))

 # Check for the availability of libz (aka zlib)

@ -584,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
 fi
 fi

-
 # Check for the availability of libedit. Different distributions put its
 # headers in different places. Try to cover the most common ones.

 if test "$enable_pcre2test_libedit" = "yes"; then
-  AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
-    [AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
-      [AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
+  AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
+    HAVE_LIBEDIT_HEADER=1
+    break
+  ])
  AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
 fi

@ -927,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
    echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
    exit 1
  fi
-  if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
-          "$HAVE_READLINE_READLINE_H" != "1"; then
-    echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
-    echo "** nor readline/readline.h was found."
+  if test -z "$HAVE_LIBEDIT_HEADER"; then
+    echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
+    echo "** edit/readline/readline.h nor a compatible header was found."
    exit 1
  fi
  if test -z "$LIBEDIT"; then
--- a/doc/html/NON-AUTOTOOLS-BUILD.txt
+++ b/doc/html/NON-AUTOTOOLS-BUILD.txt
@ -121,6 +121,7 @@ environment, for example.
       pcre2_substring.c
       pcre2_tables.c
       pcre2_ucd.c
+       pcre2_ucptables.c
       pcre2_valid_utf.c
       pcre2_xclass.c

@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
 3.  Create a new, empty build directory, preferably a subdirectory of the
    source dir. For example, C:\pcre2\pcre2-xx\build.

-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
    Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
    to start Cmake from the Windows Start menu, as this can lead to errors.

@ -343,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".

 BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO

-The code currently cannot be compiled without a stdint.h header, which is
-available only in relatively recent versions of Visual Studio. However, this
-portable and permissively-licensed implementation of the header worked without
-issue:
+The code currently cannot be compiled without an inttypes.h header, which is
+available only with Visual Studio 2013 or newer. However, this portable and
+permissively-licensed implementation of the stdint.h header could be used as an
+alternative:

  http://www.azillionmonkeys.com/qed/pstdint.h

@ -373,7 +374,7 @@ Otherwise:
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
   have been created.

-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
   the pcre2 source (wherein which the testdata folder resides), e.g.:

   set srcdir=C:\pcre2\pcre2-10.00
--- a/doc/html/README.txt
+++ b/doc/html/README.txt
@ -5,18 +5,19 @@ PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
 API. Since its initial release in 2015, there has been further development of
 the code and it now differs from PCRE1 in more than just the API. There are new
 features, and the internals have been improved. The original PCRE1 library is
-now obsolete and should not be used in new projects. The latest release of
-PCRE2 is available in three alternative formats from:
+now obsolete and no longer maintained. The latest release of PCRE2 is available
+in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:

-https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz
-https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2
-https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip
+https://github.com/PCRE2Project/pcre2/releases

-There is a mailing list for discussion about the development of PCRE at
-pcre-dev@exim.org. You can access the archives and subscribe or manage your
-subscription here:
+There is a mailing list for discussion about the development of PCRE2 at
+pcre2-dev@googlegroups.com. You can subscribe by sending an email to
+pcre2-dev+subscribe@googlegroups.com.

-   https://lists.exim.org/mailman/listinfo/pcre-dev
+You can access the archives and also subscribe or manage your subscription
+here:
+
+https://groups.google.com/g/pcre2-dev

 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@ -113,12 +114,18 @@ Building PCRE2 using autotools
 The following instructions assume the use of the widely used "configure; make;
 make install" (autotools) process.

-To build PCRE2 on system that supports autotools, first run the "configure"
-command from the PCRE2 distribution directory, with your current directory set
+If you have downloaded and unpacked a PCRE2 release tarball, run the
+"configure" command from the PCRE2 directory, with your current directory set
 to the directory where you want the files to be created. This command is a
 standard GNU "autoconf" configuration script, for which generic instructions
 are supplied in the file INSTALL.

+The files in the GitHub repository do not contain "configure". If you have
+downloaded the PCRE2 source files from GitHub, before you can run "configure"
+you must run the shell script called autogen.sh. This runs a number of
+autotools to create a "configure" script (you must of course have the autotools
+commands installed in order to do this).
+
 Most commonly, people build PCRE2 within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient. However,
 the usual methods of changing standard defaults are available. For example:
@ -187,10 +194,10 @@ library. They are also documented in the pcre2build man page.

  As well as supporting UTF strings, Unicode support includes support for the
  \P, \p, and \X sequences that recognize Unicode character properties.
-  However, only the basic two-letter properties such as Lu are supported.
-  Escape sequences such as \d and \w in patterns do not by default make use of
-  Unicode properties, but can be made to do so by setting the PCRE2_UCP option
-  or starting a pattern with (*UCP).
+  However, only a subset of Unicode properties are supported; see the
+  pcre2pattern man page for details. Escape sequences such as \d and \w in
+  patterns do not by default make use of Unicode properties, but can be made to
+  do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).

 . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
  of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
@ -368,19 +375,20 @@ library. They are also documented in the pcre2build man page.
  necessary to specify something like LIBS="-lncurses" as well. This is
  because, to quote the readline INSTALL, "Readline uses the termcap functions,
  but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
+  applications which link with readline the option to choose an appropriate
+  library."
  If you get error messages about missing functions tgetstr, tgetent, tputs,
  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
  should fix it.

 . The C99 standard defines formatting modifiers z and t for size_t and
  ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-  environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-  defined and has a value greater than or equal to 199901L (indicating C99).
-  However, there is at least one environment that claims to be C99 but does not
-  support these modifiers. If --disable-percent-zt is specified, no use is made
-  of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
-  size_t values.
+  environments other than Microsoft Visual Studio versions earlier than 2013
+  when __STDC_VERSION__ is defined and has a value greater than or equal to
+  199901L (indicating C99). However, there is at least one environment that
+  claims to be C99 but does not support these modifiers. If
+  --disable-percent-zt is specified, no use is made of the z or t modifiers.
+  Instead of %td or %zu, %lu is used, with a cast for size_t values.

 . There is a special option called --enable-fuzz-support for use by people who
  want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
@ -393,10 +401,10 @@ library. They are also documented in the pcre2build man page.
  Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
  be created. This is normally run under valgrind or used when PCRE2 is
  compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.

 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
  which caused pcre2_match() to use individual blocks on the heap for
@ -410,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
 . Makefile             the makefile that builds the library
 . src/config.h         build-time configuration options for the library
 . src/pcre2.h          the public PCRE2 header file
-. pcre2-config          script that shows the building settings such as CFLAGS
+. pcre2-config         script that shows the building settings such as CFLAGS
                         that were set for "configure"
 . libpcre2-8.pc        )
 . libpcre2-16.pc       ) data for the pkg-config command
@ -601,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.

 Many (but not all) of the tests that are not skipped are run twice if JIT
 support is available. On the second run, JIT compilation is forced. This
-testing can be suppressed by putting "nojit" on the RunTest command line.
+testing can be suppressed by putting "-nojit" on the RunTest command line.

 The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
 libraries that are enabled. If you want to run just one set of tests, call
 RunTest with either the -8, -16 or -32 option.

-If valgrind is installed, you can run the tests under it by putting "valgrind"
+If valgrind is installed, you can run the tests under it by putting "-valgrind"
 on the RunTest command line. To run pcre2test on just one or more specific test
 files, give their numbers as arguments to RunTest, for example:

@ -688,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
 different code unit widths.

 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.

 Test 16 is run only when JIT support is not available. It checks that an
@ -904,4 +912,4 @@ The distribution should contain the files listed below.
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 28 April 2021
+Last updated: 15 April 2022
--- a/doc/html/pcre2.html
+++ b/doc/html/pcre2.html
@ -28,7 +28,8 @@ nearly two decades, the limitations of the original API were making development
 increasingly difficult. The new API is more extensible, and it was simplified
 by abolishing the separate "study" optimizing function; in PCRE2, patterns are
 automatically optimized where possible. Since forking from PCRE1, the code has
-been extensively refactored and new features introduced.
+been extensively refactored and new features introduced. The old library is now
+obsolete and is no longer maintained.
 </P>
 <P>
 As well as Perl-style regular expression patterns, some features that appeared
@ -193,18 +194,18 @@ function, listing its arguments and results.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <P>
 Putting an actual email address here is a spam magnet. If you want to email me,
-use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
+use my two names separated by a dot at gmail.com.
 </P>
 <br><a name="SEC5" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 28 April 2021
+Last updated: 27 August 2021
 <br>
 Copyright &copy; 1997-2021 University of Cambridge.
 <br>
--- a/doc/html/pcre2_compile.html
+++ b/doc/html/pcre2_compile.html
@ -92,8 +92,18 @@ Additional options may be set in the compile context via the
 function.
 </P>
 <P>
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the <i>errorcode</i> argument to the
+<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
+error was encountered is returned via the <i>erroroffset</i> argument.
+</P>
+<P>
+If there is no error, the value passed via <i>errorcode</i> returns the message
+"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
+via <i>erroroffset</i> is zero.
 </P>
 <P>
 There is a complete description of the PCRE2 native API, with more detail on
--- a/doc/html/pcre2_dfa_match.html
+++ b/doc/html/pcre2_dfa_match.html
@ -45,10 +45,16 @@ just once (except when processing lookaround assertions). This function is
  <i>workspace</i>    Points to a vector of ints used as working space
  <i>wscount</i>      Number of elements in the vector
 </pre>
-For <b>pcre2_dfa_match()</b>, a match context is needed only if you want to set
-up a callout function or specify the heap limit or the match or the recursion
-depth limits. The <i>length</i> and <i>startoffset</i> values are code units, not
-characters. The options are:
+The size of output vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
+data block is therefore not advisable when using this function.
+</P>
+<P>
+A match context is needed only if you want to set up a callout function or
+specify the heap limit or the match or the recursion depth limits. The
+<i>length</i> and <i>startoffset</i> values are code units, not characters. The
+options are:
 <pre>
  PCRE2_ANCHORED          Match only at the first position
  PCRE2_COPY_MATCHED_SUBJECT
--- a/doc/html/pcre2_jit_stack_create.html
+++ b/doc/html/pcre2_jit_stack_create.html
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
 <b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
 which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
 A maximum stack size of 512KiB to 1MiB should be more than enough for any
-pattern. For more details, see the
+pattern. If the stack couldn't be allocated or the values passed were not
+reasonable, NULL will be returned. For more details, see the
 <a href="pcre2jit.html"><b>pcre2jit</b></a>
 page.
 </P>
--- a/doc/html/pcre2_match_data_create.html
+++ b/doc/html/pcre2_match_data_create.html
@ -30,8 +30,9 @@ This function creates a new match data block, which is used for holding the
 result of a match. The first argument specifies the number of pairs of offsets
 that are required. These form the "output vector" (ovector) within the match
 data block, and are used to identify the matched string and any captured
-substrings. There is always one pair of offsets; if <b>ovecsize</b> is zero, it
-is treated as one.
+substrings when matching with <b>pcre2_match()</b>, or a number of different
+matches at the same point when used with <b>pcre2_dfa_match()</b>. There is
+always one pair of offsets; if <b>ovecsize</b> is zero, it is treated as one.
 </P>
 <P>
 The second argument points to a general context, for custom memory management,
--- a/doc/html/pcre2_match_data_create_from_pattern.html
+++ b/doc/html/pcre2_match_data_create_from_pattern.html
@ -26,12 +26,15 @@ SYNOPSIS
 DESCRIPTION
 </b><br>
 <P>
-This function creates a new match data block, which is used for holding the
-result of a match. The first argument points to a compiled pattern. The number
-of capturing parentheses within the pattern is used to compute the number of
-pairs of offsets that are required in the match data block. These form the
-"output vector" (ovector) within the match data block, and are used to identify
-the matched string and any captured substrings.
+This function creates a new match data block for holding the result of a match.
+The first argument points to a compiled pattern. The number of capturing
+parentheses within the pattern is used to compute the number of pairs of
+offsets that are required in the match data block. These form the "output
+vector" (ovector) within the match data block, and are used to identify the
+matched string and any captured substrings when matching with
+<b>pcre2_match()</b>. If you are using <b>pcre2_dfa_match()</b>, which uses the
+output vector in a different way, you should use <b>pcre2_match_data_create()</b>
+instead of this function.
 </P>
 <P>
 The second argument points to a general context, for custom memory management,
--- a/doc/html/pcre2_serialize_decode.html
+++ b/doc/html/pcre2_serialize_decode.html
@ -48,7 +48,7 @@ the following negative error codes:
  PCRE2_ERROR_BADDATA   <i>number_of_codes</i> is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in <i>bytes</i>
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      <i>codes</i> or <i>bytes</i> is NULL
 </pre>
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
--- a/doc/html/pcre2_set_compile_extra_options.html
+++ b/doc/html/pcre2_set_compile_extra_options.html
@ -30,7 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
 housed in a compile context. It completely replaces all the bits. The extra
 options are:
 <pre>
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{df800} to \x{dfff} in UTF-8 and UTF-32 modes
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \K in lookarounds
+  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \u, \U, and \x handling
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as a literal following character
  PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \r as \n
--- a/doc/html/pcre2_substitute.html
+++ b/doc/html/pcre2_substitute.html
@ -68,29 +68,29 @@ automatically added.
 The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
 zero-terminated strings. The options are:
 <pre>
-  PCRE2_ANCHORED             Match only at the first position
-  PCRE2_ENDANCHORED          Pattern can match only at end of subject
-  PCRE2_NOTBOL               Subject is not the beginning of a line
-  PCRE2_NOTEOL               Subject is not the end of a line
-  PCRE2_NOTEMPTY             An empty string is not a valid match
-  PCRE2_NOTEMPTY_ATSTART     An empty string at the start of the subject is not a valid match
-  PCRE2_NO_JIT               Do not use JIT matching
-  PCRE2_NO_UTF_CHECK         Do not check the subject or replacement for UTF validity (only relevant if
-                              PCRE2_UTF was set at compile time)
-  PCRE2_SUBSTITUTE_EXTENDED  Do extended replacement processing
-  PCRE2_SUBSTITUTE_GLOBAL    Replace all occurrences in the subject
-  PCRE2_SUBSTITUTE_LITERAL   The replacement string is literal
-  PCRE2_SUBSTITUTE_MATCHED   Use pre-existing match data for 1st match
-  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  If overflow, compute needed length
+  PCRE2_ANCHORED                     Match only at the first position
+  PCRE2_ENDANCHORED                  Match only at end of subject
+  PCRE2_NOTBOL                       Subject is not the beginning of a line
+  PCRE2_NOTEOL                       Subject is not the end of a line
+  PCRE2_NOTEMPTY                     An empty string is not a valid match
+  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of the subject is not a valid match
+  PCRE2_NO_JIT                       Do not use JIT matching
+  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in the subject or replacement
+                                      (only relevant if PCRE2_UTF was set at compile time)
+  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
+  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the subject
+  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
+  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for first match
+  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
-  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  Treat unknown group as unset
-  PCRE2_SUBSTITUTE_UNSET_EMPTY  Simple unset insert = empty string
+  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
+  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
 </pre>
 If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
 PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
 </P>
 <P>
-If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-zero; its
+If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
 contents must be the result of a call to <b>pcre2_match()</b> using the same
 pattern and subject.
 </P>
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
 documentation for more details). If the limit is reached, the negative error
 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
 is built; if it is not, the default is set very large and is essentially
-"unlimited".
+unlimited.
 </P>
 <P>
 A value for the heap limit may also be supplied by an item at the start of a
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
 limit is set, less than the default.
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack for recording backtracking points. The more nested backtracking points
-there are (that is, the deeper the search tree), the more memory is needed.
-Heap memory is used only if the initial vector is too small. If the heap limit
-is set to a value less than 21 (in particular, zero) no heap memory will be
-used. In this case, only patterns that do not have a lot of nested backtracking
-can be successfully processed.
+The <b>pcre2_match()</b> function always needs some heap memory, so setting a
+value of zero guarantees a "heap limit exceeded" error. Details of how
+<b>pcre2_match()</b> uses the heap are given in the
+<a href="pcre2perform.html"><b>pcre2perform</b></a>
+documentation.
 </P>
 <P>
-Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
-when processing pattern recursions, lookarounds, or atomic groups, and only if
-this is not big enough is heap memory used. In this case, too, setting a value
-of zero disables the use of the heap.
+For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
 <br>
 <br>
 <b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
 <br>
 <br>
 This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
-Each time a nested backtracking point is passed, a new memory "frame" is used
+Each time a nested backtracking point is passed, a new memory frame is used
 to remember the state of matching at that point. Thus, this parameter
 indirectly limits the amount of memory that is used in a match. However,
-because the size of each memory "frame" depends on the number of capturing
+because the size of each memory frame depends on the number of capturing
 parentheses, the actual memory limit varies from pattern to pattern. This limit
 was more useful in versions before 10.30, where function recursion was used for
 backtracking.
@ -1383,8 +1381,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
 NULL immediately. Otherwise, the variables to which these point are set to an
 error code and an offset (number of code units) within the pattern,
 respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
-error has occurred. The values are not defined when compilation is successful
-and <b>pcre2_compile()</b> returns a non-NULL value.
+error has occurred.
 </P>
 <P>
 There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
@ -1399,15 +1396,18 @@ because the textual error messages that are obtained by calling the
 message"
 <a href="#geterrormessage">below)</a>
 should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
-for both positive and negative error codes in <b>pcre2.h</b>.
+for both positive and negative error codes in <b>pcre2.h</b>. When compilation
+is successful, <i>errorcode</i> is set to a value that returns the message "no
+error" if passed to <b>pcre2_get_error_message()</b>.
 </P>
 <P>
 The value returned in <i>erroroffset</i> is an indication of where in the
-pattern the error occurred. It is not necessarily the furthest point in the
-pattern that was read. For example, after the error "lookbehind assertion is
-not fixed length", the error offset points to the start of the failing
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
-first code unit of the failing character.
+pattern an error occurred. When there is no error, zero is returned. A non-zero
+value is not necessarily the furthest point in the pattern that was read. For
+example, after the error "lookbehind assertion is not fixed length", the error
+offset points to the start of the failing assertion. For an invalid UTF-8 or
+UTF-16 string, the offset is that of the first code unit of the failing
+character.
 </P>
 <P>
 Some errors are not detected until the whole pattern has been scanned; in these
@ -1845,7 +1845,7 @@ undefined. It may cause your program to crash or loop.
 </P>
 <P>
 Note that this option can also be passed to <b>pcre2_match()</b> and
-<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
+<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
 string.
 </P>
 <P>
@ -1914,6 +1914,13 @@ Extra compile options
 <P>
 The option bits that can be set in a compile context by calling the
 <b>pcre2_set_compile_extra_options()</b> function are as follows:
+<pre>
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+</pre>
+Since release 10.38 PCRE2 has forbidden the use of \K within lookaround
+assertions, following Perl's lead. This option is provided to re-enable the
+previous behaviour (act in positive lookarounds, ignore in negative ones) in
+case anybody is relying on it.
 <pre>
  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
 </pre>
@ -2048,8 +2055,8 @@ point. However, this applies only to characters whose code points are less than
 \d.
 </P>
 <P>
-When PCRE2 is built with Unicode support (the default), the Unicode properties
-of all characters can be tested with \p and \P, or, alternatively, the
+When PCRE2 is built with Unicode support (the default), certain Unicode
+character properties can be tested with \p and \P, or, alternatively, the
 PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
 friends to use Unicode property support instead of the built-in tables.
 PCRE2_UCP also causes upper/lower casing operations on characters with code
@ -2309,7 +2316,7 @@ return zero. The third argument should point to a <b>size_t</b> variable.
  PCRE2_INFO_LASTCODETYPE
 </pre>
 Returns 1 if there is a rightmost literal code unit that must exist in any
-matched string, other than at its start. The third argument should  point to a
+matched string, other than at its start. The third argument should point to a
 <b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
 returned, the code unit value itself can be retrieved using
 PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
@ -2512,20 +2519,31 @@ to an abstract format like Java or .NET serialization.
 Information about a successful or unsuccessful match is placed in a match
 data block, which is an opaque structure that is accessed by function calls. In
 particular, the match data block contains a vector of offsets into the subject
-string that define the matched part of the subject and any substrings that were
-captured. This is known as the <i>ovector</i>.
+string that define the matched parts of the subject. This is known as the
+<i>ovector</i>.
 </P>
 <P>
 Before calling <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b>, or
 <b>pcre2_jit_match()</b> you must create a match data block by calling one of
 the creation functions above. For <b>pcre2_match_data_create()</b>, the first
-argument is the number of pairs of offsets in the <i>ovector</i>. One pair of
-offsets is required to identify the string that matched the whole pattern, with
-an additional pair for each captured substring. For example, a value of 4
-creates enough space to record the matched portion of the subject plus three
-captured substrings. A minimum of at least 1 pair is imposed by
-<b>pcre2_match_data_create()</b>, so it is always possible to return the overall
-matched string.
+argument is the number of pairs of offsets in the <i>ovector</i>.
+</P>
+<P>
+When using <b>pcre2_match()</b>, one pair of offsets is required to identify the
+string that matched the whole pattern, with an additional pair for each
+captured substring. For example, a value of 4 creates enough space to record
+the matched portion of the subject plus three captured substrings.
+</P>
+<P>
+When using <b>pcre2_dfa_match()</b> there may be multiple matched substrings of
+different lengths at the same point in the subject. The ovector should be made
+large enough to hold as many as are expected.
+</P>
+<P>
+A minimum of 1 pair is imposed by <b>pcre2_match_data_create()</b>, so
+it is always possible to return the overall matched string in the case of
+<b>pcre2_match()</b> or the longest match in the case of
+<b>pcre2_dfa_match()</b>.
 </P>
 <P>
 The second argument of <b>pcre2_match_data_create()</b> is a pointer to a
@ -2536,10 +2554,11 @@ pass NULL, which causes <b>malloc()</b> to be used.
 <P>
 For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
 pointer to a compiled pattern. The ovector is created to be exactly the right
-size to hold all the substrings a pattern might capture. The second argument is
-again a pointer to a general context, but in this case if NULL is passed, the
-memory is obtained using the same allocator that was used for the compiled
-pattern (custom or default).
+size to hold all the substrings a pattern might capture when matched using
+<b>pcre2_match()</b>. You should not use this call when matching with
+<b>pcre2_dfa_match()</b>. The second argument is again a pointer to a general
+context, but in this case if NULL is passed, the memory is obtained using the
+same allocator that was used for the compiled pattern (custom or default).
 </P>
 <P>
 A match data block can be used many times, with the same or different compiled
@ -2621,7 +2640,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
 <i>startoffset</i>. The length and offset are in code units, not characters.
 That is, they are in bytes for the 8-bit library, 16-bit code units for the
 16-bit library, and 32-bit code units for the 32-bit library, whether or not
-UTF processing is enabled.
+UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
+<i>length</i> is zero, the subject is assumed to be an empty string. If
+<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
 </P>
 <P>
 If <i>startoffset</i> is greater than the length of the subject,
@ -2643,10 +2664,10 @@ lookbehind. For example, consider the pattern
 </pre>
 which finds occurrences of "iss" in the middle of words. (\B matches only if
 the current position in the subject is not a word boundary.) When applied to
-the string "Mississipi" the first call to <b>pcre2_match()</b> finds the first
+the string "Mississippi" the first call to <b>pcre2_match()</b> finds the first
 occurrence. If <b>pcre2_match()</b> is called again with just the remainder of
-the subject, namely "issipi", it does not match, because \B is always false at
-the start of the subject, which is deemed to be a word boundary. However, if
+the subject, namely "issippi", it does not match, because \B is always false
+at the start of the subject, which is deemed to be a word boundary. However, if
 <b>pcre2_match()</b> is passed the entire string again, but with
 <i>startoffset</i> set to 4, it finds the second occurrence of "iss" because it
 is able to look behind the starting point to discover that it is preceded by a
@ -3125,11 +3146,11 @@ The backtracking match limit was reached.
 <pre>
  PCRE2_ERROR_NOMEMORY
 </pre>
-If a pattern contains many nested backtracking points, heap memory is used to
-remember them. This error is given when the memory allocation function (default
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+Heap memory is used to remember backtracking points. This error is given when
+the memory allocation function (default or custom) fails. Note that a different
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
 <pre>
  PCRE2_ERROR_NULL
 </pre>
@ -3375,12 +3396,17 @@ same number causes an error at compile time.
 <P>
 This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
 subject string in <i>outputbuffer</i>, replacing parts that were matched with
-the <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This
-can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
-option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
-replacement string(s). The default action is to perform just one replacement if
-the pattern matches, but there is an option that requests multiple replacements
-(see PCRE2_SUBSTITUTE_GLOBAL below).
+the <i>replacement</i> string, whose length is supplied in <b>rlength</b>, which
+can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
+special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
+replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
+error occurs if <i>replacement</i> is NULL.
+</P>
+<P>
+There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
+the replacement string(s). The default action is to perform just one
+replacement if the pattern matches, but there is an option that requests
+multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
 </P>
 <P>
 If successful, <b>pcre2_substitute()</b> returns the number of substitutions
@ -3414,12 +3440,12 @@ block may or may not have been changed.
 As well as the usual options for <b>pcre2_match()</b>, a number of additional
 options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
 One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
-<i>match_data</i> block must be provided, and it must have been used for an
-external call to <b>pcre2_match()</b>. The data in the <i>match_data</i> block
-(return code, offset vector) is used for the first substitution instead of
-calling <b>pcre2_match()</b> from within <b>pcre2_substitute()</b>. This allows
-an application to check for a match before choosing to substitute, without
-having to repeat the match.
+<i>match_data</i> block must be provided, and it must have already been used for
+an external call to <b>pcre2_match()</b> with the same pattern and subject
+arguments. The data in the <i>match_data</i> block (return code, offset vector)
+is then used for the first substitution instead of calling <b>pcre2_match()</b>
+from within <b>pcre2_substitute()</b>. This allows an application to check for a
+match before choosing to substitute, without having to repeat the match.
 </P>
 <P>
 The contents of the externally supplied match data block are not changed when
@ -3564,7 +3590,7 @@ and force lower case. The escape sequences change the current state: \U and
 terminating a \Q quoted sequence) reverts to no case forcing. The sequences
 \u and \l force the next character (if it is a letter) to upper or lower
 case, respectively, and then the state automatically reverts to no case
-forcing. Case forcing applies to all inserted  characters, including those from
+forcing. Case forcing applies to all inserted characters, including those from
 capture groups and letters within \Q...\E quoted sequences. If either
 PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
 properties are used for case forcing characters whose code points are greater
@ -3636,7 +3662,9 @@ default.
 </P>
 <P>
 PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
-<i>match_data</i> argument is NULL.
+<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
+arguments are NULL. For backward compatibility reasons an exception is made for
+the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
 </P>
 <P>
 PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
@ -3791,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
 <P>
 The function <b>pcre2_dfa_match()</b> is called to match a subject string
 against a compiled pattern, using a matching algorithm that scans the subject
-string just once (not counting lookaround assertions), and does not backtrack.
-This has different characteristics to the normal algorithm, and is not
-compatible with Perl. Some of the features of PCRE2 patterns are not supported.
-Nevertheless, there are times when this kind of matching can be useful. For a
-discussion of the two matching algorithms, and a list of features that
-<b>pcre2_dfa_match()</b> does not support, see the
+string just once (not counting lookaround assertions), and does not backtrack
+(except when processing lookaround assertions). This has different
+characteristics to the normal algorithm, and is not compatible with Perl. Some
+of the features of PCRE2 patterns are not supported. Nevertheless, there are
+times when this kind of matching can be useful. For a discussion of the two
+matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
+not support, see the
 <a href="pcre2matching.html"><b>pcre2matching</b></a>
 documentation.
 </P>
@ -3831,7 +3860,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
 </PRE>
 </P>
 <br><b>
-Option bits for <b>pcre_dfa_match()</b>
+Option bits for <b>pcre2_dfa_match()</b>
 </b><br>
 <P>
 The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
@ -3982,16 +4011,16 @@ fail, this error is given.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC42" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 04 November 2020
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2build.html
+++ b/doc/html/pcre2build.html
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
 UTF support allows the libraries to process character code points up to
 0x10ffff in the strings that they handle. Unicode support also gives access to
 the Unicode properties of characters, using pattern escapes such as \P, \p,
-and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
-supported. Details are given in the
+and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
+script names, and some bi-directional properties are supported. Details are
+given in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 documentation.
 </P>
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
 counting is done differently).
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The <b>pcre2_match()</b> function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 <a href="pcre2api.html"><b>pcre2api</b></a>
 documentation. The default limit (in effect unlimited) is 20 million. You can
 change this by a setting such as
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
 for --with-match-limit. You can set a lower default limit by adding, for
 example,
 <pre>
-  --with-match-limit_depth=10000
+  --with-match-limit-depth=10000
 </pre>
 to the <b>configure</b> command. This value can be overridden at run time. This
 depth limit indirectly limits the amount of heap memory that is used, but
@ -553,15 +553,16 @@ documentation.
 <P>
 The C99 standard defines formatting modifiers z and t for size_t and
 ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-defined and has a value greater than or equal to 199901L (indicating C99).
+environments other than old versions of Microsoft Visual Studio when
+__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
+(indicating support for C99).
 However, there is at least one environment that claims to be C99 but does not
 support these modifiers. If
 <pre>
  --disable-percent-zt
 </pre>
 is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
-%lu is used, with a cast for size_t values.
+a suitable format is used depending on the size of long for the platform.
 </P>
 <br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
 <P>
@ -607,16 +608,16 @@ give a warning.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC26" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 20 March 2020
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2compat.html
+++ b/doc/html/pcre2compat.html
@ -18,33 +18,41 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
 <P>
 This document describes some of the differences in the ways that PCRE2 and Perl
 handle regular expressions. The differences described here are with respect to
-Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
+Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
 information may at times be out of date.
 </P>
 <P>
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
+1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
+behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
+next character unless it is the start of a newline sequence. This means that,
+if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
+(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
+EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
+indicator.
+</P>
+<P>
+2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
 have are given in the
 <a href="pcre2unicode.html"><b>pcre2unicode</b></a>
 page.
 </P>
 <P>
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
+3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
 they do not mean what you might think. For example, (?!a){3} does not assert
 that the next three characters are not "a". It just asserts that the next
 character is not "a" three times (in principle; PCRE2 optimizes this to run the
 assertion just once). Perl allows some repeat quantifiers on other assertions,
-for example, \b* (but not \b{3}, though oddly it does allow ^{3}), but these
-do not seem to have any use. PCRE2 does not allow any kind of quantifier on
-non-lookaround assertions.
+for example, \b* , but these do not seem to have any use. PCRE2 does not allow
+any kind of quantifier on non-lookaround assertions.
 </P>
 <P>
-3. Capture groups that occur inside negative lookaround assertions are counted,
+4. Capture groups that occur inside negative lookaround assertions are counted,
 but their entries in the offsets vector are set only when a negative assertion
 is a condition that has a matching branch (that is, the condition is false).
 Perl may set such capture groups in other circumstances.
 </P>
 <P>
-4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
+5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
 \U, and \N when followed by a character name. \N on its own, matching a
 non-newline character, and \N{U+dd..}, matching a Unicode code point, are
 supported. The escapes that modify the case of following letters are
@ -55,26 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
 interprets them.
 </P>
 <P>
-5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
+6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
 built with Unicode support (the default). The properties that can be tested
 with \p and \P are limited to the general category properties such as Lu and
-Nd, script names such as Greek or Han, and the derived properties Any and L&.
-Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
-is limited. See the
+Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
+derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
+(surrogate) property, but in PCRE2 its use is limited. See the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 documentation for details. The long synonyms for property names that Perl
 supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
 to prefix any of these properties with "Is".
 </P>
 <P>
-6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
+7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
 in between are treated as literals. However, this is slightly different from
 Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
-they cause variable interpolation (but of course PCRE2 does not have
-variables). Also, Perl does "double-quotish backslash interpolation" on any
-backslashes between \Q and \E which, its documentation says, "may lead to
-confusing results". PCRE2 treats a backslash between \Q and \E just like any
-other character. Note the following examples:
+they cause variable interpolation (PCRE2 does not have variables). Also, Perl
+does "double-quotish backslash interpolation" on any backslashes between \Q
+and \E which, its documentation says, "may lead to confusing results". PCRE2
+treats a backslash between \Q and \E just like any other character. Note the
+following examples:
 <pre>
    Pattern            PCRE2 matches     Perl matches

@ -88,19 +96,19 @@ The \Q...\E sequence is recognized both inside and outside character classes
 by both PCRE2 and Perl.
 </P>
 <P>
-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
+8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
 constructions. However, PCRE2 does have a "callout" feature, which allows an
 external function to be called during pattern matching. See the
 <a href="pcre2callout.html"><b>pcre2callout</b></a>
 documentation for details.
 </P>
 <P>
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up
+9. Subroutine calls (whether recursive or not) were treated as atomic groups up
 to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
 into subroutine calls is now supported, as in Perl.
 </P>
 <P>
-9. In PCRE2, if any of the backtracking control verbs are used in a group that
+10. In PCRE2, if any of the backtracking control verbs are used in a group that
 is called as a subroutine (whether or not recursively), their effect is
 confined to that group; it does not extend to the surrounding pattern. This is
 not always the case in Perl. In particular, if (*THEN) is present in a group
@ -109,20 +117,20 @@ the group does not contain any | characters. Note that such groups are
 processed as anchored at the point where they are tested.
 </P>
 <P>
-10. If a pattern contains more than one backtracking control verb, the first
+11. If a pattern contains more than one backtracking control verb, the first
 one that is backtracked onto acts. For example, in the pattern
 A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
 triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
 same as PCRE2, but there are cases where it differs.
 </P>
 <P>
-11. There are some differences that are concerned with the settings of captured
+12. There are some differences that are concerned with the settings of captured
 strings when part of a pattern is repeated. For example, matching "aba" against
 the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
 "b".
 </P>
 <P>
-12. PCRE2's handling of duplicate capture group numbers and names is not as
+13. PCRE2's handling of duplicate capture group numbers and names is not as
 general as Perl's. This is a consequence of the fact that PCRE2 works internally
 just with numbers, using an external table to translate between numbers and
 names. In particular, a pattern such as (?|(?&#60;a&#62;A)|(?&#60;b&#62;B)), where the two
@ -132,40 +140,43 @@ to distinguish which group matched, because both names map to capture group
 number 1. To avoid this confusing situation, an error is given at compile time.
 </P>
 <P>
-13. Perl used to recognize comments in some places that PCRE2 does not, for
+14. Perl used to recognize comments in some places that PCRE2 does not, for
 example, between the ( and ? at the start of a group. If the /x modifier is
 set, Perl allowed white space between ( and ? though the latest Perls give an
 error (for a while it was just deprecated). There may still be some cases where
 Perl behaves differently.
 </P>
 <P>
-14. Perl, when in warning mode, gives warnings for character classes such as
+15. Perl, when in warning mode, gives warnings for character classes such as
 [A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
 warning features, so it gives an error in these cases because they are almost
 certainly user mistakes.
 </P>
 <P>
-15. In PCRE2, the upper/lower case character properties Lu and Ll are not
+16. In PCRE2, the upper/lower case character properties Lu and Ll are not
 affected when case-independent matching is specified. For example, \p{Lu}
 always matches an upper case letter. I think Perl has changed in this respect;
-in the release at the time of writing (5.32), \p{Lu} and \p{Ll} match all
+in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
 letters, regardless of case, when case independence is specified.
 </P>
 <P>
-16. From release 5.32.0, Perl locks out the use of \K in lookaround
-assertions. In PCRE2, \K is acted on when it occurs in positive assertions,
-but is ignored in negative assertions.
+17. From release 5.32.0, Perl locks out the use of \K in lookaround
+assertions. From release 10.38 PCRE2 does the same by default. However, there
+is an option for re-enabling the previous behaviour. When this option is set,
+\K is acted on when it occurs in positive assertions, but is ignored in
+negative assertions.
 </P>
 <P>
-17. PCRE2 provides some extensions to the Perl regular expression facilities.
+18. PCRE2 provides some extensions to the Perl regular expression facilities.
 Perl 5.10 included new features that were not in earlier versions of Perl, some
 of which (such as named parentheses) were in PCRE2 for some time before. This
-list is with respect to Perl 5.32:
+list is with respect to Perl 5.34:
 <br>
 <br>
 (a) Although lookbehind assertions in PCRE2 must match fixed length strings,
 each alternative toplevel branch of a lookbehind assertion can match a
-different length of string. Perl requires them all to have the same length.
+different length of string. Perl used to require them all to have the same
+length, but the latest version has some variable length support.
 <br>
 <br>
 (b) From PCRE2 10.23, backreferences to groups of fixed length are supported
@ -219,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible
 lookarounds are atomic.
 </P>
 <P>
-18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
+19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa
 modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
 rules. This separation cannot be represented with PCRE2_UCP.
 </P>
 <P>
-19. Perl has different limits than PCRE2. See the
+20. Perl has different limits than PCRE2. See the
 <a href="pcre2limit.html"><b>pcre2limit</b></a>
 documentation for details. Perl went with 5.10 from recursion to iteration
 keeping the intermediate matches on the heap, which is ~10% slower but does not
@ -237,7 +248,7 @@ AUTHOR
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -246,9 +257,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 06 October 2020
+Last updated: 08 December 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2convert.html
+++ b/doc/html/pcre2convert.html
@ -141,8 +141,8 @@ permitted to match separator characters, but the double-star (**) feature
 </P>
 <P>
 PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
-match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
-double-star feature disabled. These options may be given together.
+match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
+the double-star feature disabled. These options may be given together.
 </P>
 <br><a name="SEC5" href="#TOC1">CONVERTING POSIX PATTERNS</a><br>
 <P>
--- a/doc/html/pcre2demo.html
+++ b/doc/html/pcre2demo.html
@ -215,8 +215,8 @@ if (rc &lt; 0)
  return 1;
  }

-/* Match succeded. Get a pointer to the output vector, where string offsets are
-stored. */
+/* Match succeeded. Get a pointer to the output vector, where string offsets
+are stored. */

 ovector = pcre2_get_ovector_pointer(match_data);
 printf("Match succeeded at offset %d\n", (int)ovector[0]);
@ -234,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
 if (rc == 0)
  printf("ovector was not big enough for all the captured substrings\n");

-/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
-to set the start of a match later than its end. In this demonstration program,
-we just detect this case and give up. */
+/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
+assertions. However, there is an option to re-enable the old behaviour. If that
+is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
+assertion to set the start of a match later than its end. In this demonstration
+program, we show how to detect this case, but it shouldn't arise because the
+option is never set. */

 if (ovector[0] &gt; ovector[1])
  {
@ -453,7 +456,7 @@ for (;;)
    return 1;
    }

-  /* Match succeded */
+  /* Match succeeded */

  printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);

--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@ -71,13 +71,15 @@ For example:
 <pre>
  pcre2grep some-pattern file1 - file3
 </pre>
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how <b>pcre2grep</b> behaves. In
-particular, the <b>-M</b> option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-<b>-N</b> (<b>--newline</b>) option.
+However, there are options that can change how <b>pcre2grep</b> behaves. For
+example, the <b>-M</b> option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the <b>-N</b>
+(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
+not file names are shown, and the <b>-Z</b> option changes the file name
+terminator to a zero byte.
 </P>
 <P>
 The amount of memory used for buffering files that are being scanned is
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of <i>number</i>
-is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
+context lines (the <b>-Z</b> option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
+<b>-A</b> is ignored.
 </P>
 <P>
 <b>-a</b>, <b>--text</b>
@ -188,14 +192,21 @@ Treat binary files as text. This is equivalent to
 <b>--binary-files</b>=<i>text</i>.
 </P>
 <P>
+<b>--allow-lookaround-bsk</b>
+PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl.
+This option causes <b>pcre2grep</b> to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option, which enables this somewhat dangerous usage.
+</P>
+<P>
 <b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
 Output up to <i>number</i> lines of context before each matching line. Fewer
 lines are output if the previous match or the start of the file is within
 <i>number</i> lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of <i>number</i> is expected to be relatively small. When
+instead of a colon for the context lines (the <b>-Z</b> option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of <i>number</i> is expected to be relatively small. When
 <b>-c</b> is used, <b>-B</b> is ignored.
 </P>
 <P>
@ -405,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
 <P>
 <b>-H</b>, <b>--with-filename</b>
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the <b>-M</b> option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the <b>-M</b> option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
 </P>
 <P>
 <b>-h</b>, <b>--no-filename</b>
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
+Suppress the output file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The <b>-Z</b> option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>--heap-limit</b>=<i>number</i>
@ -475,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
 <b>-L</b>, <b>--files-without-match</b>
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous <b>-H</b>,
-<b>-h</b>, or <b>-l</b> options.
+output once, on a separate line by default, but if the <b>-Z</b> option is set, 
+they are separated by zero bytes instead of newlines. This option overrides any
+previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>-l</b>, <b>--files-with-matches</b>
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the <b>-c</b> (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-<b>-c</b> is a way of suppressing the listing of files with no matches that
+a separate line, but if the <b>-Z</b> option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the <b>-c</b> (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with <b>-c</b> is a way of suppressing the listing of files with no matches that
 occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
 <b>-h</b>, or <b>-L</b> options.
 </P>
@ -586,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
 <br>
 <br>
 The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
+1024 bytes), the maximum amount of heap memory that may be used for matching.
 <br>
 <br>
 The <b>--depth-limit</b> option limits the depth of nested backtracking points,
@ -833,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the <b>--include</b> or <b>--exclude</b> options.
 </P>
+<P>
+<b>-Z</b>, <b>--null</b>
+Terminate file names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
+</P>
 <br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
 <P>
 The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
@ -1040,16 +1059,16 @@ because VMS does not distinguish between exit(0) and exit(1).
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC16" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 04 October 2020
+Last updated: 30 July 2022
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2jit.html
+++ b/doc/html/pcre2jit.html
@ -54,6 +54,7 @@ platforms:
 <pre>
  ARM 32-bit (v5, v7, and Thumb2)
  ARM 64-bit
+  IBM s390x 64-bit
  Intel x86 32-bit and 64-bit
  MIPS 32-bit and 64-bit
  Power PC 32-bit and 64-bit
@ -268,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
 for currently suspended match(es).
 </P>
 <P>
-In a multithread application, if you do not
-specify a JIT stack, or if you assign or pass back NULL from a callback, that
-is thread-safe, because each thread has its own machine stack. However, if you
-assign or pass back a non-NULL JIT stack, this must be a different stack for
-each thread so that the application is thread-safe.
+In a multithreaded application, if you do not specify a JIT stack, or if you
+assign or pass back NULL from a callback, that is thread-safe, because each
+thread has its own machine stack. However, if you assign or pass back a
+non-NULL JIT stack, this must be a different stack for each thread so that the
+application is thread-safe.
 </P>
 <P>
 Strictly speaking, even more is allowed. You can assign the same non-NULL stack
@ -286,7 +287,7 @@ inefficient solution, and not recommended.
 This is a suggestion for how a multithreaded program that needs to set up
 non-default JIT stacks might operate:
 <pre>
-  During thread initalization
+  During thread initialization
    thread_local_var = pcre2_jit_stack_create(...)

  During thread exit
@ -381,8 +382,8 @@ out this complicated API.
 <b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
 </P>
 <P>
-The JIT executable allocator does not free all memory when it is possible.
-It expects new allocations, and keeps some free memory around to improve
+The JIT executable allocator does not free all memory when it is possible. It
+expects new allocations, and keeps some free memory around to improve
 allocation speed. However, in low memory conditions, it might be better to free
 all possible memory. You can cause this to happen by calling
 pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
@ -441,10 +442,10 @@ that was not compiled.
 <P>
 When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
 number of other sanity checks are performed on the arguments. For example, if
-the subject pointer is NULL, an immediate error is given. Also, unless
-PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
-interests of speed, these checks do not happen on the JIT fast path, and if
-invalid data is passed, the result is undefined.
+the subject pointer is NULL but the length is non-zero, an immediate error is
+given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
+for validity. In the interests of speed, these checks do not happen on the JIT
+fast path, and if invalid data is passed, the result is undefined.
 </P>
 <P>
 Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
@ -465,9 +466,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC14" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 23 May 2019
+Last updated: 30 November 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2limits.html
+++ b/doc/html/pcre2limits.html
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
 </P>
+<P>
+The maximum amount of heap memory used for matching is controlled by the heap 
+limit, which can be set in a pattern or in a match context. The default is a 
+very large number, effectively unlimited.
+</P>
 <br><b>
 AUTHOR
 </b><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -86,9 +91,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 02 February 2019
+Last updated: 26 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2matching.html
+++ b/doc/html/pcre2matching.html
@ -78,8 +78,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
 If a leaf node is reached, a matching string has been found, and at that point
 the algorithm stops. Thus, if there is more than one possible match, this
 algorithm returns the first one that it finds. Whether this is the shortest,
-the longest, or some intermediate length depends on the way the greedy and
-ungreedy repetition quantifiers are specified in the pattern.
+the longest, or some intermediate length depends on the way the alternations
+and the greedy or ungreedy repetition quantifiers are specified in the
+pattern.
 </P>
 <P>
 Because it ends up with a single path through the tree, it is relatively
@ -109,11 +110,17 @@ no more unterminated paths. At this point, terminated paths represent the
 different matching possibilities (if there are none, the match has failed).
 Thus, if there is more than one possible match, this algorithm finds all of
 them, and in particular, it finds the longest. The matches are returned in
-decreasing order of length. There is an option to stop the algorithm after the
-first match (which is necessarily the shortest) is found.
+the output vector in decreasing order of length. There is an option to stop the
+algorithm after the first match (which is necessarily the shortest) is found.
 </P>
 <P>
-Note that all the matches that are found start at the same point in the
+Note that the size of the vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
+data block is therefore not advisable when doing DFA matching.
+</P>
+<P>
+Note also that all the matches that are found start at the same point in the
 subject. If the pattern
 <pre>
  cat(er(pillar)?)?
@ -194,21 +201,14 @@ supported by <b>pcre2_dfa_match()</b>.
 </P>
 <br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
 <P>
-Using the alternative matching algorithm provides the following advantages:
+The main advantage of the alternative algorithm is that all possible matches
+(at a single point in the subject) are automatically found, and in particular,
+the longest match is found. To find more than one match at the same point using
+the standard algorithm, you have to do kludgy things with callouts.
 </P>
 <P>
-1. All possible matches (at a single point in the subject) are automatically
-found, and in particular, the longest match is found. To find more than one
-match using the standard algorithm, you have to do kludgy things with
-callouts.
-</P>
-<P>
-2. Because the alternative algorithm scans the subject string just once, and
-never needs to backtrack (except for lookbehinds), it is possible to pass very
-long subject strings to the matching function in several pieces, checking for
-partial matching each time. Although it is also possible to do multi-segment
-matching using the standard algorithm, by retaining partially matched
-substrings, it is more complicated. The
+Partial matching is possible with this algorithm, though it has some
+limitations. The
 <a href="pcre2partial.html"><b>pcre2partial</b></a>
 documentation gives details of partial matching and discusses multi-segment
 matching.
@ -230,20 +230,23 @@ invalid UTF string are not supported.
 3. Although atomic groups are supported, their use does not provide the
 performance advantage that it does for the standard algorithm.
 </P>
+<P>
+4. JIT optimization is not supported.
+</P>
 <br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC8" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 23 May 2019
+Last updated: 28 August 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2pattern.html
+++ b/doc/html/pcre2pattern.html
@ -534,7 +534,7 @@ for themselves. For example, outside a character class:
  \0113  is a tab followed by the character "3"
  \113   might be a backreference, otherwise the character with octal code 113
  \377   might be a backreference, otherwise the value 255 (decimal)
-  \81    is always a backreference .sp
+  \81    is always a backreference
 </pre>
 Note that octal values of 100 or greater that are specified using this syntax
 must not be introduced by a leading zero, because no more than three octal
@ -745,7 +745,7 @@ Unicode support is not needed for these characters to be recognized.
 <P>
 It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
 complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
-at compile time. (BSR is an abbrevation for "backslash R".) This can be made
+at compile time. (BSR is an abbreviation for "backslash R".) This can be made
 the default when PCRE2 is built; if this is the case, the other behaviour can
 be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
 these settings by starting a pattern string with one of the following
@ -776,194 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
 sequences are of course limited to testing characters whose code points are
 less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
 greater than 0x10ffff (the Unicode limit) may be encountered. These are all
-treated as being in the Unknown script and with an unassigned type. The extra
-escape sequences are:
+treated as being in the Unknown script and with an unassigned type.
+</P>
+<P>
+Matching characters by Unicode property is not fast, because PCRE2 has to do a
+multistage table lookup in order to find a character's property. That is why
+the traditional escape sequences such as \d and \w do not use Unicode
+properties in PCRE2 by default, though you can make them do so by setting the
+PCRE2_UCP option or by starting the pattern with (*UCP).
+</P>
+<P>
+The extra escape sequences that provide property support are:
 <pre>
  \p{<i>xx</i>}   a character with the <i>xx</i> property
  \P{<i>xx</i>}   a character without the <i>xx</i> property
  \X       a Unicode extended grapheme cluster
 </pre>
-The property names represented by <i>xx</i> above are case-sensitive. There is
-support for Unicode script names, Unicode general category properties, "Any",
-which matches any character (including newline), and some special PCRE2
-properties (described in the
-<a href="#extraprops">next section).</a>
-Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
-Note that \P{Any} does not match any characters, so always causes a match
-failure.
+The property names represented by <i>xx</i> above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
+<a href="#extraprops">below).</a>
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \P{Any} does not match any characters, so always causes a
+match failure.
+</P>
+<br><b>
+Script properties for \p and \P
+</b><br>
+<P>
+There are three different syntax forms for matching a script. Each Unicode
+character has a basic script and, optionally, a list of other scripts ("Script
+Extensions") with which it is commonly used. Using the Adlam script as an
+example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
+\p{scx:Adlam} matches, in addition, characters that have Adlam in their
+extensions list. The full names "script" and "script extensions" for the
+property types are recognized, and an equals sign is an alternative to the
+colon. If a script name is given without a property type, for example,
+\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
+interpretation at release 5.26 and PCRE2 changed at release 10.40.
 </P>
 <P>
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-<pre>
-  \p{Greek}
-  \P{Han}
-</pre>
 Unassigned characters (and in non-UTF 32-bit mode, characters with code points
 greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
 part of an identified script are lumped together as "Common". The current list
-of scripts is:
-</P>
-<P>
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Unknown,
-Vai,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
+of recognized script names and their 4-character abbreviations can be obtained
+by running this command:
+<pre>
+  pcre2test -LS
+
+</PRE>
 </P>
+<br><b>
+The general category property for \p and \P
+</b><br>
 <P>
 Each character has exactly one Unicode general category property, specified by
 a two-letter abbreviation. For compatibility with Perl, negation can be
@ -1025,9 +893,9 @@ The following general category property codes are supported:
  Zp    Paragraph separator
  Zs    Space separator
 </pre>
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
+The special property LC, which has the synonym L&, is also supported: it
+matches a character that has the Lu, Ll, or Lt property, in other words, a
+letter that is not classified as a modifier or "other".
 </P>
 <P>
 The Cs (Surrogate) property applies only to characters whose code points are in
@ -1054,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
 example, \p{Lu} always matches only upper case letters. This is different from
 the behaviour of current versions of Perl.
 </P>
+<br><b>
+Binary (yes/no) properties for \p and \P
+</b><br>
 <P>
-Matching characters by Unicode property is not fast, because PCRE2 has to do a
-multistage table lookup in order to find a character's property. That is why
-the traditional escape sequences such as \d and \w do not use Unicode
-properties in PCRE2 by default, though you can make them do so by setting the
-PCRE2_UCP option or by starting the pattern with (*UCP).
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\p and \P, along with their abbreviations, by running this command:
+<pre>
+  pcre2test -LP
+
+</PRE>
+</P>
+<br><b>
+The Bidi_Class property for \p and \P
+</b><br>
+<P>
+<pre>
+  \p{Bidi_Class:&#60;class&#62;}   matches a character with the given class
+  \p{BC:&#60;class&#62;}           matches a character with the given class
+</pre>
+The recognized classes are:
+<pre>
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
+</pre>
+An equals sign may be used instead of a colon. The class names are
+case-insensitive; only the short names listed above are recognized.
 </P>
 <br><b>
 Extended grapheme clusters
@ -1090,7 +1000,7 @@ additional characters according to the following rules for ending a cluster:
 3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
 are of five types: L, V, T, LV, and LVT. An L character may be followed by an
 L, V, LV, or LVT character; an LV or V character may be followed by a V or T
-character; an LVT or T character may be follwed only by a T character.
+character; an LVT or T character may be followed only by a T character.
 </P>
 <P>
 4. Do not end before extending characters or spacing marks or the "zero-width
@ -1175,9 +1085,11 @@ For example, when the pattern
 matches "foobar", the first substring is still set to "foo".
 </P>
 <P>
-Perl used to document that the use of \K within lookaround assertions is "not
-well defined", but from version 5.32.0 Perl does not support this usage at all.
-In PCRE2, \K is acted upon when it occurs inside positive assertions, but is
+From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From
+release 10.38 PCRE2 also forbids this by default. However, the
+PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
+<b>pcre2_compile()</b> to re-enable the previous behaviour. When this option is
+set, \K is acted upon when it occurs inside positive assertions, but is
 ignored in negative assertions. Note that when a pattern such as (?=ab\K)
 matches, the reported start of the match can be greater than the end of the
 match. Using \K in a lookbehind assertion at the start of a pattern can also
@ -1334,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
 <P>
 Outside a character class, a dot in the pattern matches any one character in
 the subject string except (by default) a character that signifies the end of a
-line.
+line. One or more characters may be specified as line terminators (see
+<a href="#newlines">"Newline conventions"</a>
+above).
 </P>
 <P>
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
+Dot never matches a single line-ending character. When the two-character
+sequence CRLF is the only line ending, dot does not match CR if it is
+immediately followed by LF, but otherwise it matches all characters (including
+isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
+of CR or LF match dot. When all Unicode line endings are being recognized, dot
+does not match CR or LF or any of the other line ending characters.
 </P>
 <P>
 The behaviour of dot with regard to newlines can be changed. If the
@ -2173,10 +2087,10 @@ be easier to remember:
 <pre>
  (*atomic:\d+)foo
 </pre>
-This kind of parenthesized group "locks up" the  part of the pattern it
-contains once it has matched, and a failure further into the pattern is
-prevented from backtracking into it. Backtracking past it to previous items,
-however, works as normal.
+This kind of parenthesized group "locks up" the part of the pattern it contains
+once it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
 </P>
 <P>
 An alternative description is that a group of this type matches exactly the
@ -2897,7 +2811,7 @@ breaks):
  (?(DEFINE) (?&#60;byte&#62; 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
  \b (?&byte) (\.(?&byte)){3} \b
 </pre>
-The first part of the pattern is a DEFINE group inside which a another group
+The first part of the pattern is a DEFINE group inside which another group
 named "byte" is defined. This matches an individual component of an IPv4
 address (a number less than 256). When matching takes place, this part of the
 pattern is skipped because DEFINE acts like a false condition. The rest of the
@ -3607,7 +3521,7 @@ successful match if there is a later mismatch. Consider:
 </pre>
 If the subject is "aaaac...", after the first match attempt fails (starting at
 the first character in the string), the starting point skips on to start the
-next attempt at "c". Note that a possessive quantifer does not have the same
+next attempt at "c". Note that a possessive quantifier does not have the same
 effect as this example; although it would suppress backtracking during the
 first match attempt, the second attempt would start at the second character
 instead of skipping on to "c".
@ -3845,16 +3759,16 @@ there is a backtrack at the outer level.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC32" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 06 October 2020
+Last updated: 12 January 2022
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2perform.html
+++ b/doc/html/pcre2perform.html
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code. 
+</P>
+<P>
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+</P>
+<P>
+Until release 10.41, an initial 20KiB frames vector was allocated on the system 
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to <b>pcre2_match()</b>. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+</P>
+<P>
+The size of the initial block is the larger of 20KiB or ten times the pattern's 
+frame size, unless the heap limit is less than this, in which case the heap 
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is 
+checked only when a new block is to be allocated. Reducing the heap limit 
+between calls to <b>pcre2_match()</b> with the same match data block does not 
+affect the saved block.
 </P>
 <P>
 In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC6" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 03 February 2019
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2serialize.html
+++ b/doc/html/pcre2serialize.html
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
 <br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
 <P>
 <b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
-<b>  int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
+<b>  int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
 <b>  pcre2_general_context *<i>gcontext</i>);</b>
 <br>
 <br>
-<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
-<b>  int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
+<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
+<b>  int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
 <b>  PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
 <br>
 <br>
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
 <pre>
  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 </pre>
@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
 <b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
 stream is no longer needed and can be discarded.
 <pre>
-  int32_t number_of_codes;
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = &#60;serialized data&#62;;
  int32_t number_of_codes =
--- a/doc/html/pcre2syntax.html
+++ b/doc/html/pcre2syntax.html
@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
 <li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
 <li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
 <li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
-<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
-<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
-<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
-<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
-<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
-<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
-<li><a name="TOC13" href="#SEC13">CAPTURING</a>
-<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
-<li><a name="TOC15" href="#SEC15">COMMENT</a>
-<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
-<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
-<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
-<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
-<li><a name="TOC20" href="#SEC20">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
-<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
-<li><a name="TOC22" href="#SEC22">BACKREFERENCES</a>
-<li><a name="TOC23" href="#SEC23">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
-<li><a name="TOC24" href="#SEC24">CONDITIONAL PATTERNS</a>
-<li><a name="TOC25" href="#SEC25">BACKTRACKING CONTROL</a>
-<li><a name="TOC26" href="#SEC26">CALLOUTS</a>
-<li><a name="TOC27" href="#SEC27">SEE ALSO</a>
-<li><a name="TOC28" href="#SEC28">AUTHOR</a>
-<li><a name="TOC29" href="#SEC29">REVISION</a>
+<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
+<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
+<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
+<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
+<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
+<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
+<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
+<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
+<li><a name="TOC15" href="#SEC15">CAPTURING</a>
+<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
+<li><a name="TOC17" href="#SEC17">COMMENT</a>
+<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
+<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
+<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
+<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
+<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
+<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
+<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
+<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
+<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
+<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
+<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
+<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
+<li><a name="TOC30" href="#SEC30">AUTHOR</a>
+<li><a name="TOC31" href="#SEC31">REVISION</a>
 </ul>
 <br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
 <P>
@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
 sequences is changed to use Unicode properties and they match many more
 characters.
 </P>
+<P>
+Property descriptions in \p and \P are matched caselessly; hyphens,
+underscores, and white space are ignored, in accordance with Unicode's "loose
+matching" rules.
+</P>
 <br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
 <P>
 <pre>
@ -152,6 +159,7 @@ characters.
  Lo         Other letter
  Lt         Title case letter
  Lu         Upper case letter
+  Lc         Ll, Lu, or Lt
  L&         Ll, Lu, or Lt

  M          Mark
@ -198,166 +206,58 @@ characters.
 Perl and POSIX space are now the same. Perl added VT to its space character set
 at release 5.18.
 </P>
-<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
+<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
 <P>
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Vai,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\p and \P, along with their abbreviations, by running this command:
+<pre>
+  pcre2test -LP
+</PRE>
 </P>
-<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
+<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
+<P>
+Many script names and their 4-letter abbreviations are recognized in
+\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
+course). You can obtain a list of these scripts by running this command:
+<pre>
+  pcre2test -LS
+</PRE>
+</P>
+<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
+<P>
+<pre>
+  \p{Bidi_Class:&#60;class&#62;}   matches a character with the given class
+  \p{BC:&#60;class&#62;}           matches a character with the given class
+</pre>
+The recognized classes are:
+<pre>
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
+</PRE>
+</P>
+<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
 <P>
 <pre>
  [...]       positive character class
@ -385,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
 but some of them use Unicode properties if PCRE2_UCP is set. You can use
 \Q...\E inside a character class.
 </P>
-<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
+<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
 <P>
 <pre>
  ?           0 or 1, greedy
@ -406,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
  {n,}?       n or more, lazy
 </PRE>
 </P>
-<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
+<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
 <P>
 <pre>
  \b          word boundary
@ -424,20 +324,23 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
  \G          first matching position in subject
 </PRE>
 </P>
-<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
+<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
 <P>
 <pre>
  \K          set reported start of match
 </pre>
+From release 10.38 \K is not permitted by default in lookaround assertions,
+for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option is set, the previous behaviour is re-enabled. When this option is set,
 \K is honoured in positive assertions, but ignored in negative ones.
 </P>
-<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
+<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
 <P>
 <pre>
  expr|expr|expr...
 </PRE>
 </P>
-<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
+<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
 <P>
 <pre>
  (...)           capture group
@ -452,20 +355,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
 in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
 both cases, a name must not start with a digit.
 </P>
-<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
+<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
 <P>
 <pre>
  (?&#62;...)         atomic non-capture group
  (*atomic:...)   atomic non-capture group
 </PRE>
 </P>
-<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
+<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
 <P>
 <pre>
  (?#....)        comment (not nestable)
 </PRE>
 </P>
-<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
+<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
 <P>
 Changes of these options within a group are automatically cancelled at the end
 of the group.
@ -510,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
 application can lock out the use of (*UTF) and (*UCP) by setting the
 PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
 </P>
-<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
+<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
 <P>
 These are recognized only at the very start of the pattern or after option
 settings with a similar syntax.
@ -523,7 +426,7 @@ settings with a similar syntax.
  (*NUL)          the NUL character (binary zero)
 </PRE>
 </P>
-<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
+<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
 <P>
 These are recognized only at the very start of the pattern or after option
 setting with a similar syntax.
@ -532,7 +435,7 @@ setting with a similar syntax.
  (*BSR_UNICODE)  any Unicode newline sequence
 </PRE>
 </P>
-<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
+<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
 <P>
 <pre>
  (?=...)                     )
@ -553,7 +456,7 @@ setting with a similar syntax.
 </pre>
 Each top-level branch of a lookbehind must be of a fixed length.
 </P>
-<br><a name="SEC20" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
+<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
 <P>
 These assertions are specific to PCRE2 and are not Perl-compatible.
 <pre>
@ -566,7 +469,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  (*non_atomic_positive_lookbehind:...)  )
 </PRE>
 </P>
-<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
+<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
 <P>
 <pre>
  (*script_run:...)           ) script run, can be backtracked into
@ -576,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  (*asr:...)                  )
 </PRE>
 </P>
-<br><a name="SEC22" href="#TOC1">BACKREFERENCES</a><br>
+<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
 <P>
 <pre>
  \n              reference by number (can be ambiguous)
@ -593,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  (?P=name)       reference by name (Python)
 </PRE>
 </P>
-<br><a name="SEC23" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
+<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
 <P>
 <pre>
  (?R)            recurse whole pattern
@ -612,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
  \g'-n'          call subroutine by relative number (PCRE2 extension)
 </PRE>
 </P>
-<br><a name="SEC24" href="#TOC1">CONDITIONAL PATTERNS</a><br>
+<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
 <P>
 <pre>
  (?(condition)yes-pattern)
@ -635,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
 conditions or recursion tests. Such a condition is interpreted as a reference
 condition if the relevant named group exists.
 </P>
-<br><a name="SEC25" href="#TOC1">BACKTRACKING CONTROL</a><br>
+<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
 <P>
 All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
 name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
@ -662,7 +565,7 @@ pattern is not anchored.
 The effect of one of these verbs in a group called as a subroutine is confined
 to the subroutine call.
 </P>
-<br><a name="SEC26" href="#TOC1">CALLOUTS</a><br>
+<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
 <P>
 <pre>
  (?C)            callout (assumed number 0)
@ -673,25 +576,25 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
 start and the end), and the starting delimiter { matched with the ending
 delimiter }. To encode the ending delimiter within the string, double it.
 </P>
-<br><a name="SEC27" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
 <P>
 <b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
 <b>pcre2matching</b>(3), <b>pcre2</b>(3).
 </P>
-<br><a name="SEC28" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
-<br><a name="SEC29" href="#TOC1">REVISION</a><br>
+<br><a name="SEC31" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 28 December 2019
+Last updated: 12 January 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@ -59,12 +59,7 @@ patterns, and the subject lines specify PCRE2 function options, control how the
 subject is processed, and what output is produced.
 </P>
 <P>
-As the original fairly simple PCRE library evolved, it acquired many different
-features, and as a result, the original <b>pcretest</b> program ended up with a
-lot of options in a messy, arcane syntax for testing all the features. The
-move to the new PCRE2 API provided an opportunity to re-implement the test
-program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there
-are still many obscure modifiers, some of which are specifically designed for
+There are many obscure modifiers, some of which are specifically designed for
 use in conjunction with the test script and data files that are distributed as
 part of PCRE2. All the modifiers are documented here, some without much
 justification, but many of them are unlikely to be of use except when testing
@ -83,16 +78,16 @@ to 8-bit code units for output.
 </P>
 <P>
 In the rest of this document, the names of library functions and structures
-are given in generic form, for example, <b>pcre_compile()</b>. The actual
+are given in generic form, for example, <b>pcre2_compile()</b>. The actual
 names used in the libraries have a suffix _8, _16, or _32, as appropriate.
 <a name="inputencoding"></a></P>
 <br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
 <P>
 Input to <b>pcre2test</b> is processed line by line, either by calling the C
-library's <b>fgets()</b> function, or via the <b>libreadline</b> library. In some
-Windows environments character 26 (hex 1A) causes an immediate end of file, and
-no further data is read, so this character should be avoided unless you really
-want that action.
+library's <b>fgets()</b> function, or via the <b>libreadline</b> or <b>libedit</b>
+library. In some Windows environments character 26 (hex 1A) causes an immediate
+end of file, and no further data is read, so this character should be avoided
+unless you really want that action.
 </P>
 <P>
 The input is processed using using C's string functions, so must not
@ -258,7 +253,19 @@ available, and the use of JIT for matching is verified.
 <b>-LM</b>
 List modifiers: write a list of available pattern and subject modifiers to the
 standard output, then exit with zero exit code. All other options are ignored.
-If both -C and -LM are present, whichever is first is recognized.
+If both -C and any -Lx options are present, whichever is first is recognized.
+</P>
+<P>
+<b>-LP</b>
+List properties: write a list of recognized Unicode properties to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
+</P>
+<P>
+<b>-LS</b>
+List scripts: write a list of recognized Unicode script names to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
 </P>
 <P>
 <b>-pattern</b> <i>modifier-list</i>
@ -486,15 +493,17 @@ excluding pattern meta-characters):
 </pre>
 This is interpreted as the pattern's delimiter. A regular expression may be
 continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it with a backslash, for example
+included within it. It is possible to include the delimiter as a literal within
+the pattern by escaping it with a backslash, for example
 <pre>
  /abc\/def/
 </pre>
 If you do this, the escape and the delimiter form part of the pattern, but
-since the delimiters are all non-alphanumeric, this does not affect its
-interpretation. If the terminating delimiter is immediately followed by a
-backslash, for example,
+since the delimiters are all non-alphanumeric, the inclusion of the backslash
+does not affect the pattern's interpretation. Note, however, that this trick
+does not work within \Q...\E literal bracketing because the backslash will
+itself be interpreted as a literal. If the terminating delimiter is immediately
+followed by a backslash, for example,
 <pre>
  /abc/\
 </pre>
@ -512,11 +521,11 @@ A pattern can be followed by a modifier list (details below).
 </P>
 <br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br>
 <P>
-Before each subject line is passed to <b>pcre2_match()</b> or
-<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the
-line is scanned for backslash escapes, unless the <b>subject_literal</b>
-modifier was set for the pattern. The following provide a means of encoding
-non-printing characters in a visible way:
+Before each subject line is passed to <b>pcre2_match()</b>,
+<b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>, leading and trailing white
+space is removed, and the line is scanned for backslash escapes, unless the
+<b>subject_literal</b> modifier was set for the pattern. The following provide a
+means of encoding non-printing characters in a visible way:
 <pre>
  \a         alarm (BEL, \x07)
  \b         backspace (\x08)
@ -613,6 +622,7 @@ way <b>pcre2_compile()</b> behaves. See
 for a description of the effects of these options.
 <pre>
      allow_empty_class         set PCRE2_ALLOW_EMPTY_CLASS
+      allow_lookaround_bsk      set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
      allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
      alt_bsux                  set PCRE2_ALT_BSUX
      alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
@ -1231,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
      copy=&#60;number or name&#62;      copy captured substring
      depth_limit=&#60;n&#62;            set a depth limit
      dfa                        use <b>pcre2_dfa_match()</b>
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
      get=&#60;number or name&#62;       extract captured substring
      getall                     extract all captured substrings
  /g  global                     global matching
@ -1241,6 +1252,8 @@ pattern, but can be overridden by modifiers on the subject.
      match_limit=&#60;n&#62;            set a match limit
      memory                     show heap memory usage
      null_context               match with a NULL context
+      null_replacement           substitute with NULL replacement
+      null_subject               match with NULL subject
      offset=&#60;n&#62;                 set starting offset
      offset_limit=&#60;n&#62;           set offset limit
      ovector=&#60;n&#62;                set size of output vector
@ -1552,7 +1565,7 @@ Setting heap, match, and depth limits
 <P>
 The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
 the appropriate limits in the match context. These values are ignored when the
-<b>find_limits</b> modifier is specified.
+<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
 </P>
 <br><b>
 Finding minimum limits
@ -1562,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
 calls the relevant matching function several times, setting different values in
 the match context via <b>pcre2_set_heap_limit()</b>,
 <b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 </P>
 <P>
 When using this modifier, the pattern should not contain any limit settings
@ -1591,9 +1608,7 @@ overall amount of computing resource that is used.
 </P>
 <P>
 For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 </P>
 <br><b>
 Showing MARK names
@ -1611,12 +1626,10 @@ Showing memory usage
 <P>
 The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the <b>memory</b> modifier never has any effect. For this modifier to work, the
+<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
+is used only when a match requires more internal workspace than the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 <b>null_context</b> modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 </P>
@ -1670,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
 passing the replacement string as zero-terminated.
 </P>
 <br><b>
-Passing a NULL context
+Passing a NULL context, subject, or replacement
 </b><br>
 <P>
 Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
@ -1678,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
 If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-<b>find_limits</b> or <b>substitute_callout</b> modifiers.
+<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
+modifiers.
+</P>
+<P>
+Similarly, for testing purposes, if the <b>null_subject</b> or
+<b>null_replacement</b> modifier is set, the subject or replacement string
+pointers are passed as NULL, respectively, to the relevant functions.
 </P>
 <br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
 <P>
@ -2117,16 +2136,16 @@ on the stack.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC21" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 28 April 2021
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2unicode.html
+++ b/doc/html/pcre2unicode.html
@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
 <P>
 When PCRE2 is built with Unicode support, the escape sequences \p{..},
 \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
-The Unicode properties that can be tested are limited to the general category
-properties such as Lu for an upper case letter or Nd for a decimal number, the
-Unicode script names such as Arabic or Han, and the derived properties Any and
-L&. Full lists are given in the
+The Unicode properties that can be tested are a subset of those that Perl
+supports. Currently they are limited to the general category properties such as
+Lu for an upper case letter or Nd for a decimal number, the Unicode script
+names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
+properties Any and LC (synonym L&). Full lists are given in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 and
 <a href="pcre2syntax.html"><b>pcre2syntax</b></a>
-documentation. Only the short names for properties are supported. For example,
-\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE2 does not support this.
+documentation. In general, only the short names for properties are supported.
+For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
+supported. Furthermore, in Perl, many properties may optionally be prefixed by
+"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
 </P>
 <br><b>
 WIDE CHARACTERS AND UTF MODES
@ -477,7 +478,7 @@ AUTHOR
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -486,9 +487,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 23 February 2020
+Last updated: 22 December 2021
 <br>
-Copyright &copy; 1997-2020 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/pcre2.3
+++ b/doc/pcre2.3
@ -1,4 +1,4 @@
-.TH PCRE2 3 "28 April 2021" "PCRE2 10.37"
+.TH PCRE2 3 "27 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH INTRODUCTION
@ -11,7 +11,8 @@ nearly two decades, the limitations of the original API were making development
 increasingly difficult. The new API is more extensible, and it was simplified
 by abolishing the separate "study" optimizing function; in PCRE2, patterns are
 automatically optimized where possible. Since forking from PCRE1, the code has
-been extensively refactored and new features introduced.
+been extensively refactored and new features introduced. The old library is now
+obsolete and is no longer maintained.
 .P
 As well as Perl-style regular expression patterns, some features that appeared
 in Python and the original PCRE before they appeared in Perl are available
@ -190,18 +191,18 @@ function, listing its arguments and results.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .P
 Putting an actual email address here is a spam magnet. If you want to email me,
-use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
+use my two names separated by a dot at gmail.com.
 .
 .
 .SH REVISION
 .rs
 .sp
 .nf
-Last updated: 28 April 2021
+Last updated: 27 August 2021
 Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
--- a/doc/pcre2_compile.3
+++ b/doc/pcre2_compile.3
@ -1,4 +1,4 @@
-.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
+.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -80,8 +80,17 @@ Additional options may be set in the compile context via the
 .\"
 function.
 .P
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the \fIerrorcode\fP argument to the
+\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
+error was encountered is returned via the \fIerroroffset\fP argument.
+.P
+If there is no error, the value returned via \fIerrorcode\fP yields the message
+"no error" when passed to \fBpcre2_get_error_message()\fP, and the value
+returned via \fIerroroffset\fP is zero.
 .P
 There is a complete description of the PCRE2 native API, with more detail on
 each option, in the
--- a/doc/pcre2_dfa_match.3
+++ b/doc/pcre2_dfa_match.3
@ -1,4 +1,4 @@
-.TH PCRE2_DFA_MATCH 3 "16 October 2018" "PCRE2 10.33"
+.TH PCRE2_DFA_MATCH 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -33,10 +33,15 @@ just once (except when processing lookaround assertions). This function is
  \fIworkspace\fP    Points to a vector of ints used as working space
  \fIwscount\fP      Number of elements in the vector
 .sp
-For \fBpcre2_dfa_match()\fP, a match context is needed only if you want to set
-up a callout function or specify the heap limit or the match or the recursion
-depth limits. The \fIlength\fP and \fIstartoffset\fP values are code units, not
-characters. The options are:
+The size of output vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
+data block is therefore not advisable when using this function.
+.P
+A match context is needed only if you want to set up a callout function or
+specify the heap limit or the match or the recursion depth limits. The
+\fIlength\fP and \fIstartoffset\fP values are code units, not characters. The
+options are:
 .sp
  PCRE2_ANCHORED          Match only at the first position
  PCRE2_COPY_MATCHED_SUBJECT
--- a/doc/pcre2_jit_stack_create.3
+++ b/doc/pcre2_jit_stack_create.3
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
 \fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
 which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
 A maximum stack size of 512KiB to 1MiB should be more than enough for any
-pattern. For more details, see the
+pattern. If the stack couldn't be allocated or the values passed were not
+reasonable, NULL will be returned. For more details, see the
 .\" HREF
 \fBpcre2jit\fP
 .\"
--- a/doc/pcre2_match_data_create.3
+++ b/doc/pcre2_match_data_create.3
@ -1,4 +1,4 @@
-.TH PCRE2_MATCH_DATA_CREATE 3 "29 July 2015" "PCRE2 10.21"
+.TH PCRE2_MATCH_DATA_CREATE 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -18,8 +18,9 @@ This function creates a new match data block, which is used for holding the
 result of a match. The first argument specifies the number of pairs of offsets
 that are required. These form the "output vector" (ovector) within the match
 data block, and are used to identify the matched string and any captured
-substrings. There is always one pair of offsets; if \fBovecsize\fP is zero, it
-is treated as one.
+substrings when matching with \fBpcre2_match()\fP, or a number of different
+matches at the same point when used with \fBpcre2_dfa_match()\fP. There is
+always one pair of offsets; if \fBovecsize\fP is zero, it is treated as one.
 .P
 The second argument points to a general context, for custom memory management,
 or is NULL for system memory management. The result of the function is NULL if
--- a/doc/pcre2_match_data_create_from_pattern.3
+++ b/doc/pcre2_match_data_create_from_pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "29 July 2015" "PCRE2 10.21"
+.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -14,12 +14,15 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .SH DESCRIPTION
 .rs
 .sp
-This function creates a new match data block, which is used for holding the
-result of a match. The first argument points to a compiled pattern. The number
-of capturing parentheses within the pattern is used to compute the number of
-pairs of offsets that are required in the match data block. These form the
-"output vector" (ovector) within the match data block, and are used to identify
-the matched string and any captured substrings.
+This function creates a new match data block for holding the result of a match.
+The first argument points to a compiled pattern. The number of capturing
+parentheses within the pattern is used to compute the number of pairs of
+offsets that are required in the match data block. These form the "output
+vector" (ovector) within the match data block, and are used to identify the
+matched string and any captured substrings when matching with
+\fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the
+output vector in a different way, you should use \fBpcre2_match_data_create()\fP
+instead of this function.
 .P
 The second argument points to a general context, for custom memory management,
 or is NULL to use the same memory allocator as was used for the compiled
--- a/doc/pcre2_serialize_decode.3
+++ b/doc/pcre2_serialize_decode.3
@ -36,7 +36,7 @@ the following negative error codes:
  PCRE2_ERROR_BADDATA   \fInumber_of_codes\fP is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in \fIbytes\fP
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      \fIcodes\fP or \fIbytes\fP is NULL
 .sp
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
--- a/doc/pcre2_set_compile_extra_options.3
+++ b/doc/pcre2_set_compile_extra_options.3
@ -1,4 +1,4 @@
-.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "11 February 2019" "PCRE2 10.33"
+.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "31 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -18,12 +18,13 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
 housed in a compile context. It completely replaces all the bits. The extra
 options are:
 .sp
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \eK in lookarounds
 .\" JOIN
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{df800} to \ex{dfff}
+  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{d800} to \ex{dfff}
                                         in UTF-8 and UTF-32 modes
 .\" JOIN
-  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \eu, \eU, and \ex
-                                         handling
+  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \eu, \eU, and
+                                         \ex handling
 .\" JOIN
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as
                                         a literal following character
--- a/doc/pcre2_substitute.3
+++ b/doc/pcre2_substitute.3
@ -55,32 +55,42 @@ automatically added.
 The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
 zero-terminated strings. The options are:
 .sp
-  PCRE2_ANCHORED             Match only at the first position
-  PCRE2_ENDANCHORED          Pattern can match only at end of subject
-  PCRE2_NOTBOL               Subject is not the beginning of a line
-  PCRE2_NOTEOL               Subject is not the end of a line
-  PCRE2_NOTEMPTY             An empty string is not a valid match
+  PCRE2_ANCHORED                     Match only at the first position
+  PCRE2_ENDANCHORED                  Match only at end of subject
 .\" JOIN
-  PCRE2_NOTEMPTY_ATSTART     An empty string at the start of the
-                              subject is not a valid match
-  PCRE2_NO_JIT               Do not use JIT matching
+  PCRE2_NOTBOL                       Subject is not the beginning of a
+                                      line
+  PCRE2_NOTEOL                       Subject is not the end of a line
 .\" JOIN
-  PCRE2_NO_UTF_CHECK         Do not check the subject or replacement
-                              for UTF validity (only relevant if
-                              PCRE2_UTF was set at compile time)
-  PCRE2_SUBSTITUTE_EXTENDED  Do extended replacement processing
-  PCRE2_SUBSTITUTE_GLOBAL    Replace all occurrences in the subject
-  PCRE2_SUBSTITUTE_LITERAL   The replacement string is literal
-  PCRE2_SUBSTITUTE_MATCHED   Use pre-existing match data for 1st match
-  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  If overflow, compute needed length
+  PCRE2_NOTEMPTY                     An empty string is not a
+                                      valid match
+.\" JOIN
+  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of
+                                      the subject is not a valid match
+  PCRE2_NO_JIT                       Do not use JIT matching
+.\" JOIN
+  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in
+                                      the subject or replacement
+.\" JOIN
+                                      (only relevant if PCRE2_UTF was
+                                      set at compile time)
+  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
+.\" JOIN
+  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the
+                                      subject
+  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
+.\" JOIN
+  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for
+                                      first match
+  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
-  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  Treat unknown group as unset
-  PCRE2_SUBSTITUTE_UNSET_EMPTY  Simple unset insert = empty string
+  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
+  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
 .sp
 If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
 PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
 .P
-If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-zero; its
+If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
 contents must be the result of a call to \fBpcre2_match()\fP using the same
 pattern and subject.
 .P
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@ -1,4 +1,4 @@
-.TH PCRE2API 3 "04 November 2020" "PCRE2 10.36"
+.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@ -953,7 +953,7 @@ has its own memory control arrangements (see the
 documentation for more details). If the limit is reached, the negative error
 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
 is built; if it is not, the default is set very large and is essentially
-"unlimited".
+unlimited.
 .P
 A value for the heap limit may also be supplied by an item at the start of a
 pattern of the form
@ -964,18 +964,18 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is
 less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
 limit is set, less than the default.
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack for recording backtracking points. The more nested backtracking points
-there are (that is, the deeper the search tree), the more memory is needed.
-Heap memory is used only if the initial vector is too small. If the heap limit
-is set to a value less than 21 (in particular, zero) no heap memory will be
-used. In this case, only patterns that do not have a lot of nested backtracking
-can be successfully processed.
+The \fBpcre2_match()\fP function always needs some heap memory, so setting a
+value of zero guarantees a "heap limit exceeded" error. Details of how
+\fBpcre2_match()\fP uses the heap are given in the
+.\" HREF
+\fBpcre2perform\fP
+.\"
+documentation.
 .P
-Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
-when processing pattern recursions, lookarounds, or atomic groups, and only if
-this is not big enough is heap memory used. In this case, too, setting a value
-of zero disables the use of the heap.
+For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
 .sp
 .nf
 .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
@ -1019,10 +1019,10 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
 .fi
 .sp
 This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
-Each time a nested backtracking point is passed, a new memory "frame" is used
+Each time a nested backtracking point is passed, a new memory frame is used
 to remember the state of matching at that point. Thus, this parameter
 indirectly limits the amount of memory that is used in a match. However,
-because the size of each memory "frame" depends on the number of capturing
+because the size of each memory frame depends on the number of capturing
 parentheses, the actual memory limit varies from pattern to pattern. This limit
 was more useful in versions before 10.30, where function recursion was used for
 backtracking.
@ -1323,8 +1323,7 @@ If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
 NULL immediately. Otherwise, the variables to which these point are set to an
 error code and an offset (number of code units) within the pattern,
 respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
-error has occurred. The values are not defined when compilation is successful
-and \fBpcre2_compile()\fP returns a non-NULL value.
+error has occurred.
 .P
 There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
 if it finds an error in the pattern. There are also some negative error codes
@ -1343,14 +1342,17 @@ message"
 below)
 .\"
 should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
-for both positive and negative error codes in \fBpcre2.h\fP.
+for both positive and negative error codes in \fBpcre2.h\fP. When compilation
+is successful \fIerrorcode\fP is set to a value that returns the message "no
+error" if passed to \fBpcre2_get_error_message()\fP.
 .P
 The value returned in \fIerroroffset\fP is an indication of where in the
-pattern the error occurred. It is not necessarily the furthest point in the
-pattern that was read. For example, after the error "lookbehind assertion is
-not fixed length", the error offset points to the start of the failing
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
-first code unit of the failing character.
+pattern an error occurred. When there is no error, zero is returned. A non-zero
+value is not necessarily the furthest point in the pattern that was read. For
+example, after the error "lookbehind assertion is not fixed length", the error
+offset points to the start of the failing assertion. For an invalid UTF-8 or
+UTF-16 string, the offset is that of the first code unit of the failing
+character.
 .P
 Some errors are not detected until the whole pattern has been scanned; in these
 cases, the offset passed back is the length of the pattern. Note that the
@ -1794,7 +1796,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is
 undefined. It may cause your program to crash or loop.
 .P
 Note that this option can also be passed to \fBpcre2_match()\fP and
-\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject
+\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
 string.
 .P
 Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
@ -1875,6 +1877,13 @@ characters with code points greater than 127.
 .sp
 The option bits that can be set in a compile context by calling the
 \fBpcre2_set_compile_extra_options()\fP function are as follows:
+.sp
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+.sp
+Since release 10.38 PCRE2 has forbidden the use of \eK within lookaround
+assertions, following Perl's lead. This option is provided to re-enable the
+previous behaviour (act in positive lookarounds, ignore in negative ones) in
+case anybody is relying on it.
 .sp
  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
 .sp
@ -2008,8 +2017,8 @@ point. However, this applies only to characters whose code points are less than
 256. By default, higher-valued code points never match escapes such as \ew or
 \ed.
 .P
-When PCRE2 is built with Unicode support (the default), the Unicode properties
-of all characters can be tested with \ep and \eP, or, alternatively, the
+When PCRE2 is built with Unicode support (the default), certain Unicode
+character properties can be tested with \ep and \eP, or, alternatively, the
 PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and
 friends to use Unicode property support instead of the built-in tables.
 PCRE2_UCP also causes upper/lower casing operations on characters with code
@ -2272,7 +2281,7 @@ return zero. The third argument should point to a \fBsize_t\fP variable.
  PCRE2_INFO_LASTCODETYPE
 .sp
 Returns 1 if there is a rightmost literal code unit that must exist in any
-matched string, other than at its start. The third argument should  point to a
+matched string, other than at its start. The third argument should point to a
 \fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
 returned, the code unit value itself can be retrieved using
 PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
@ -2490,19 +2499,27 @@ to an abstract format like Java or .NET serialization.
 Information about a successful or unsuccessful match is placed in a match
 data block, which is an opaque structure that is accessed by function calls. In
 particular, the match data block contains a vector of offsets into the subject
-string that define the matched part of the subject and any substrings that were
-captured. This is known as the \fIovector\fP.
+string that define the matched parts of the subject. This is known as the
+\fIovector\fP.
 .P
 Before calling \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or
 \fBpcre2_jit_match()\fP you must create a match data block by calling one of
 the creation functions above. For \fBpcre2_match_data_create()\fP, the first
-argument is the number of pairs of offsets in the \fIovector\fP. One pair of
-offsets is required to identify the string that matched the whole pattern, with
-an additional pair for each captured substring. For example, a value of 4
-creates enough space to record the matched portion of the subject plus three
-captured substrings. A minimum of at least 1 pair is imposed by
-\fBpcre2_match_data_create()\fP, so it is always possible to return the overall
-matched string.
+argument is the number of pairs of offsets in the \fIovector\fP.
+.P
+When using \fBpcre2_match()\fP, one pair of offsets is required to identify the
+string that matched the whole pattern, with an additional pair for each
+captured substring. For example, a value of 4 creates enough space to record
+the matched portion of the subject plus three captured substrings.
+.P
+When using \fBpcre2_dfa_match()\fP there may be multiple matched substrings of
+different lengths at the same point in the subject. The ovector should be made
+large enough to hold as many as are expected.
+.P
+A minimum of 1 pair is imposed by \fBpcre2_match_data_create()\fP, so
+it is always possible to return the overall matched string in the case of
+\fBpcre2_match()\fP or the longest match in the case of
+\fBpcre2_dfa_match()\fP.
 .P
 The second argument of \fBpcre2_match_data_create()\fP is a pointer to a
 general context, which can specify custom memory management for obtaining the
@ -2511,10 +2528,11 @@ pass NULL, which causes \fBmalloc()\fP to be used.
 .P
 For \fBpcre2_match_data_create_from_pattern()\fP, the first argument is a
 pointer to a compiled pattern. The ovector is created to be exactly the right
-size to hold all the substrings a pattern might capture. The second argument is
-again a pointer to a general context, but in this case if NULL is passed, the
-memory is obtained using the same allocator that was used for the compiled
-pattern (custom or default).
+size to hold all the substrings a pattern might capture when matched using
+\fBpcre2_match()\fP. You should not use this call when matching with
+\fBpcre2_dfa_match()\fP. The second argument is again a pointer to a general
+context, but in this case if NULL is passed, the memory is obtained using the
+same allocator that was used for the compiled pattern (custom or default).
 .P
 A match data block can be used many times, with the same or different compiled
 patterns. You can extract information from a match data block after a match
@ -2608,7 +2626,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in
 \fIstartoffset\fP. The length and offset are in code units, not characters.
 That is, they are in bytes for the 8-bit library, 16-bit code units for the
 16-bit library, and 32-bit code units for the 32-bit library, whether or not
-UTF processing is enabled.
+UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and
+\fIlength\fP is zero, the subject is assumed to be an empty string. If
+\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL.
 .P
 If \fIstartoffset\fP is greater than the length of the subject,
 \fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
@ -2628,10 +2648,10 @@ lookbehind. For example, consider the pattern
 .sp
 which finds occurrences of "iss" in the middle of words. (\eB matches only if
 the current position in the subject is not a word boundary.) When applied to
-the string "Mississipi" the first call to \fBpcre2_match()\fP finds the first
+the string "Mississippi" the first call to \fBpcre2_match()\fP finds the first
 occurrence. If \fBpcre2_match()\fP is called again with just the remainder of
-the subject, namely "issipi", it does not match, because \eB is always false at
-the start of the subject, which is deemed to be a word boundary. However, if
+the subject, namely "issippi", it does not match, because \eB is always false
+at the start of the subject, which is deemed to be a word boundary. However, if
 \fBpcre2_match()\fP is passed the entire string again, but with
 \fIstartoffset\fP set to 4, it finds the second occurrence of "iss" because it
 is able to look behind the starting point to discover that it is preceded by a
@ -3142,11 +3162,11 @@ The backtracking match limit was reached.
 .sp
  PCRE2_ERROR_NOMEMORY
 .sp
-If a pattern contains many nested backtracking points, heap memory is used to
-remember them. This error is given when the memory allocation function (default
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+Heap memory is used to remember backtracking points. This error is given when
+the memory allocation function (default or custom) fails. Note that a different
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
 .sp
  PCRE2_ERROR_NULL
 .sp
@ -3397,12 +3417,16 @@ same number causes an error at compile time.
 .P
 This function optionally calls \fBpcre2_match()\fP and then makes a copy of the
 subject string in \fIoutputbuffer\fP, replacing parts that were matched with
-the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This
-can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
-option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
-replacement string(s). The default action is to perform just one replacement if
-the pattern matches, but there is an option that requests multiple replacements
-(see PCRE2_SUBSTITUTE_GLOBAL below).
+the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP, which
+can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
+special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the
+replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an
+error occurs if \fIreplacement\fP is NULL.
+.P
+There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
+the replacement string(s). The default action is to perform just one
+replacement if the pattern matches, but there is an option that requests
+multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
 .P
 If successful, \fBpcre2_substitute()\fP returns the number of substitutions
 that were carried out. This may be zero if no match was found, and is never
@ -3431,12 +3455,12 @@ block may or may not have been changed.
 As well as the usual options for \fBpcre2_match()\fP, a number of additional
 options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
 One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
-\fImatch_data\fP block must be provided, and it must have been used for an
-external call to \fBpcre2_match()\fP. The data in the \fImatch_data\fP block
-(return code, offset vector) is used for the first substitution instead of
-calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. This allows
-an application to check for a match before choosing to substitute, without
-having to repeat the match.
+\fImatch_data\fP block must be provided, and it must have already been used for
+an external call to \fBpcre2_match()\fP with the same pattern and subject
+arguments. The data in the \fImatch_data\fP block (return code, offset vector)
+is then used for the first substitution instead of calling \fBpcre2_match()\fP
+from within \fBpcre2_substitute()\fP. This allows an application to check for a
+match before choosing to substitute, without having to repeat the match.
 .P
 The contents of the externally supplied match data block are not changed when
 PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
@ -3568,7 +3592,7 @@ and force lower case. The escape sequences change the current state: \eU and
 terminating a \eQ quoted sequence) reverts to no case forcing. The sequences
 \eu and \el force the next character (if it is a letter) to upper or lower
 case, respectively, and then the state automatically reverts to no case
-forcing. Case forcing applies to all inserted  characters, including those from
+forcing. Case forcing applies to all inserted characters, including those from
 capture groups and letters within \eQ...\eE quoted sequences. If either
 PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
 properties are used for case forcing characters whose code points are greater
@ -3633,7 +3657,9 @@ needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
 default.
 .P
 PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
-\fImatch_data\fP argument is NULL.
+\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP
+arguments are NULL. For backward compatibility reasons an exception is made for
+the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0.
 .P
 PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
 replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
@ -3795,12 +3821,13 @@ other alternatives. Ultimately, when it runs out of matches,
 .P
 The function \fBpcre2_dfa_match()\fP is called to match a subject string
 against a compiled pattern, using a matching algorithm that scans the subject
-string just once (not counting lookaround assertions), and does not backtrack.
-This has different characteristics to the normal algorithm, and is not
-compatible with Perl. Some of the features of PCRE2 patterns are not supported.
-Nevertheless, there are times when this kind of matching can be useful. For a
-discussion of the two matching algorithms, and a list of features that
-\fBpcre2_dfa_match()\fP does not support, see the
+string just once (not counting lookaround assertions), and does not backtrack
+(except when processing lookaround assertions). This has different
+characteristics to the normal algorithm, and is not compatible with Perl. Some
+of the features of PCRE2 patterns are not supported. Nevertheless, there are
+times when this kind of matching can be useful. For a discussion of the two
+matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does
+not support, see the
 .\" HREF
 \fBpcre2matching\fP
 .\"
@ -3832,7 +3859,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
    wspace,         /* working space vector */
    20);            /* number of elements (NOT size in bytes) */
 .
-.SS "Option bits for \fBpcre_dfa_match()\fP"
+.SS "Option bits for \fBpcre2_dfa_match()\fP"
 .rs
 .sp
 The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
@ -3991,7 +4018,7 @@ fail, this error is given.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -4000,6 +4027,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 04 November 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2build.3
+++ b/doc/pcre2build.3
@ -1,4 +1,4 @@
-.TH PCRE2BUILD 3 "20 March 2020" "PCRE2 10.35"
+.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
 UTF support allows the libraries to process character code points up to
 0x10ffff in the strings that they handle. Unicode support also gives access to
 the Unicode properties of characters, using pattern escapes such as \eP, \ep,
-and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
-supported. Details are given in the
+and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
+script names, and some bi-directional properties are supported. Details are
+given in the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
 \fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
 counting is done differently).
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The \fBpcre2_match()\fP function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 .\" HREF
 \fBpcre2api\fP
 .\"
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
 for --with-match-limit. You can set a lower default limit by adding, for
 example,
 .sp
-  --with-match-limit_depth=10000
+  --with-match-limit-depth=10000
 .sp
 to the \fBconfigure\fP command. This value can be overridden at run time. This
 depth limit indirectly limits the amount of heap memory that is used, but
@ -563,15 +563,16 @@ documentation.
 .sp
 The C99 standard defines formatting modifiers z and t for size_t and
 ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-defined and has a value greater than or equal to 199901L (indicating C99).
+environments other than old versions of Microsoft Visual Studio when
+__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
+(indicating support for C99).
 However, there is at least one environment that claims to be C99 but does not
 support these modifiers. If
 .sp
  --disable-percent-zt
 .sp
 is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
-%lu is used, with a cast for size_t values.
+a suitable format is used depending on the size of long for the platform.
 .
 .
 .SH "SUPPORT FOR FUZZERS"
@ -623,7 +624,7 @@ give a warning.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -632,6 +633,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 20 March 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2compat.3
+++ b/doc/pcre2compat.3
@ -1,4 +1,4 @@
-.TH PCRE2COMPAT 3 "06 October 2020" "PCRE2 10.36"
+.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
@ -6,31 +6,38 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
 This document describes some of the differences in the ways that PCRE2 and Perl
 handle regular expressions. The differences described here are with respect to
-Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
+Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
 information may at times be out of date.
 .P
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
+1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
+behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
+next character unless it is the start of a newline sequence. This means that,
+if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
+(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
+EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
+indicator.
+.P
+2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
 have are given in the
 .\" HREF
 \fBpcre2unicode\fP
 .\"
 page.
 .P
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
+3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
 they do not mean what you might think. For example, (?!a){3} does not assert
 that the next three characters are not "a". It just asserts that the next
 character is not "a" three times (in principle; PCRE2 optimizes this to run the
 assertion just once). Perl allows some repeat quantifiers on other assertions,
-for example, \eb* (but not \eb{3}, though oddly it does allow ^{3}), but these
-do not seem to have any use. PCRE2 does not allow any kind of quantifier on
-non-lookaround assertions.
+for example, \eb*, but these do not seem to have any use. PCRE2 does not allow
+any kind of quantifier on non-lookaround assertions.
 .P
-3. Capture groups that occur inside negative lookaround assertions are counted,
+4. Capture groups that occur inside negative lookaround assertions are counted,
 but their entries in the offsets vector are set only when a negative assertion
 is a condition that has a matching branch (that is, the condition is false).
 Perl may set such capture groups in other circumstances.
 .P
-4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
+5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
 \eU, and \eN when followed by a character name. \eN on its own, matching a
 non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
 supported. The escapes that modify the case of following letters are
@ -40,12 +47,12 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
 PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
 interprets them.
 .P
-5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
+6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
 built with Unicode support (the default). The properties that can be tested
 with \ep and \eP are limited to the general category properties such as Lu and
-Nd, script names such as Greek or Han, and the derived properties Any and L&.
-Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
-is limited. See the
+Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
+derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
+(surrogate) property, but in PCRE2 its use is limited. See the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -53,14 +60,14 @@ documentation for details. The long synonyms for property names that Perl
 supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
 to prefix any of these properties with "Is".
 .P
-6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
+7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
 in between are treated as literals. However, this is slightly different from
 Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
-they cause variable interpolation (but of course PCRE2 does not have
-variables). Also, Perl does "double-quotish backslash interpolation" on any
-backslashes between \eQ and \eE which, its documentation says, "may lead to
-confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
-other character. Note the following examples:
+they cause variable interpolation (PCRE2 does not have variables). Also, Perl
+does "double-quotish backslash interpolation" on any backslashes between \eQ
+and \eE which, its documentation says, "may lead to confusing results". PCRE2
+treats a backslash between \eQ and \eE just like any other character. Note the
+following examples:
 .sp
    Pattern            PCRE2 matches     Perl matches
 .sp
@ -75,7 +82,7 @@ other character. Note the following examples:
 The \eQ...\eE sequence is recognized both inside and outside character classes
 by both PCRE2 and Perl.
 .P
-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
+8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
 constructions. However, PCRE2 does have a "callout" feature, which allows an
 external function to be called during pattern matching. See the
 .\" HREF
@ -83,11 +90,11 @@ external function to be called during pattern matching. See the
 .\"
 documentation for details.
 .P
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up
+9. Subroutine calls (whether recursive or not) were treated as atomic groups up
 to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
 into subroutine calls is now supported, as in Perl.
 .P
-9. In PCRE2, if any of the backtracking control verbs are used in a group that
+10. In PCRE2, if any of the backtracking control verbs are used in a group that
 is called as a subroutine (whether or not recursively), their effect is
 confined to that group; it does not extend to the surrounding pattern. This is
 not always the case in Perl. In particular, if (*THEN) is present in a group
@ -95,18 +102,18 @@ that is called as a subroutine, its action is limited to that group, even if
 the group does not contain any | characters. Note that such groups are
 processed as anchored at the point where they are tested.
 .P
-10. If a pattern contains more than one backtracking control verb, the first
+11. If a pattern contains more than one backtracking control verb, the first
 one that is backtracked onto acts. For example, in the pattern
 A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
 triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
 same as PCRE2, but there are cases where it differs.
 .P
-11. There are some differences that are concerned with the settings of captured
+12. There are some differences that are concerned with the settings of captured
 strings when part of a pattern is repeated. For example, matching "aba" against
 the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
 "b".
 .P
-12. PCRE2's handling of duplicate capture group numbers and names is not as
+13. PCRE2's handling of duplicate capture group numbers and names is not as
 general as Perl's. This is a consequence of the fact the PCRE2 works internally
 just with numbers, using an external table to translate between numbers and
 names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
@ -115,35 +122,38 @@ causes an error at compile time. If it were allowed, it would not be possible
 to distinguish which group matched, because both names map to capture group
 number 1. To avoid this confusing situation, an error is given at compile time.
 .P
-13. Perl used to recognize comments in some places that PCRE2 does not, for
+14. Perl used to recognize comments in some places that PCRE2 does not, for
 example, between the ( and ? at the start of a group. If the /x modifier is
 set, Perl allowed white space between ( and ? though the latest Perls give an
 error (for a while it was just deprecated). There may still be some cases where
 Perl behaves differently.
 .P
-14. Perl, when in warning mode, gives warnings for character classes such as
+15. Perl, when in warning mode, gives warnings for character classes such as
 [A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
 warning features, so it gives an error in these cases because they are almost
 certainly user mistakes.
 .P
-15. In PCRE2, the upper/lower case character properties Lu and Ll are not
+16. In PCRE2, the upper/lower case character properties Lu and Ll are not
 affected when case-independent matching is specified. For example, \ep{Lu}
 always matches an upper case letter. I think Perl has changed in this respect;
-in the release at the time of writing (5.32), \ep{Lu} and \ep{Ll} match all
+in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
 letters, regardless of case, when case independence is specified.
 .P
-16. From release 5.32.0, Perl locks out the use of \eK in lookaround
-assertions. In PCRE2, \eK is acted on when it occurs in positive assertions,
-but is ignored in negative assertions.
+17. From release 5.32.0, Perl locks out the use of \eK in lookaround
+assertions. From release 10.38 PCRE2 does the same by default. However, there
+is an option for re-enabling the previous behaviour. When this option is set,
+\eK is acted on when it occurs in positive assertions, but is ignored in
+negative assertions.
 .P
-17. PCRE2 provides some extensions to the Perl regular expression facilities.
+18. PCRE2 provides some extensions to the Perl regular expression facilities.
 Perl 5.10 included new features that were not in earlier versions of Perl, some
 of which (such as named parentheses) were in PCRE2 for some time before. This
-list is with respect to Perl 5.32:
+list is with respect to Perl 5.34:
 .sp
 (a) Although lookbehind assertions in PCRE2 must match fixed length strings,
 each alternative toplevel branch of a lookbehind assertion can match a
-different length of string. Perl requires them all to have the same length.
+different length of string. Perl used to require them all to have the same
+length, but the latest version has some variable length support.
 .sp
 (b) From PCRE2 10.23, backreferences to groups of fixed length are supported
 in lookbehinds, provided that there is no possibility of referencing a
@ -184,11 +194,11 @@ the pattern.
 extension to the lookaround facilities. The default, Perl-compatible
 lookarounds are atomic.
 .P
+19. The Perl /a modifier restricts \ed numbers to pure ascii, and the /aa
+19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
 modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
 rules. This separation cannot be represented with PCRE2_UCP.
 .P
-19. Perl has different limits than PCRE2. See the
+20. Perl has different limits than PCRE2. See the
 .\" HREF
 \fBpcre2limit\fP
 .\"
@ -203,7 +213,7 @@ fall into any stack-overflow limit. PCRE2 made a similar change at release
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -212,6 +222,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 06 October 2020
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 08 December 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2convert.3
+++ b/doc/pcre2convert.3
@ -116,8 +116,8 @@ permitted to match separator characters, but the double-star (**) feature
 (which does match separators) is supported.
 .P
 PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
-match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
-double-star feature disabled. These options may be given together.
+match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
+the double-star feature disabled. These options may be given together.
 .
 .
 .SH "CONVERTING POSIX PATTERNS"
--- a/doc/pcre2demo.3
+++ b/doc/pcre2demo.3
@ -215,8 +215,8 @@ if (rc < 0)
  return 1;
  }

-/* Match succeded. Get a pointer to the output vector, where string offsets are
-stored. */
+/* Match succeeded. Get a pointer to the output vector, where string offsets
+are stored. */

 ovector = pcre2_get_ovector_pointer(match_data);
 printf("Match succeeded at offset %d\en", (int)ovector[0]);
@ -234,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
 if (rc == 0)
  printf("ovector was not big enough for all the captured substrings\en");

-/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
-to set the start of a match later than its end. In this demonstration program,
-we just detect this case and give up. */
+/* Since release 10.38 PCRE2 has locked out the use of \eK in lookaround
+assertions. However, there is an option to re-enable the old behaviour. If that
+is set, it is possible to run patterns such as /(?=.\eK)/ that use \eK in an
+assertion to set the start of a match later than its end. In this demonstration
+program, we show how to detect this case, but it shouldn't arise because the
+option is never set. */

 if (ovector[0] > ovector[1])
  {
@ -453,7 +456,7 @@ for (;;)
    return 1;
    }

-  /* Match succeded */
+  /* Match succeeded */

  printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);

--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "04 October 2020" "PCRE2 10.36"
+.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
 .SH NAME
 pcre2grep - a grep with Perl-compatible regular expressions.
 .SH SYNOPSIS
@ -43,13 +43,15 @@ For example:
 .sp
  pcre2grep some-pattern file1 - file3
 .sp
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how \fBpcre2grep\fP behaves. In
-particular, the \fB-M\fP option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-\fB-N\fP (\fB--newline\fP) option.
+However, there are options that can change how \fBpcre2grep\fP behaves. For
+example, the \fB-M\fP option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the \fB-N\fP
+(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
+not file names are shown, and the \fB-Z\fP option changes the file name
+terminator to a zero byte.
 .P
 The amount of memory used for buffering files that are being scanned is
 controlled by parameters that can be set by the \fB--buffer-size\fP and
@ -149,22 +151,30 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of \fInumber\fP
-is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
+context lines (the \fB-Z\fP option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
+\fB-A\fP is ignored.
 .TP
 \fB-a\fP, \fB--text\fP
 Treat binary files as text. This is equivalent to
 \fB--binary-files\fP=\fItext\fP.
 .TP
+\fB--allow-lookaround-bsk\fP
+PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl.
+This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option, which enables this somewhat dangerous usage.
+.TP
 \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
 Output up to \fInumber\fP lines of context before each matching line. Fewer
 lines are output if the previous match or the start of the file is within
 \fInumber\fP lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of \fInumber\fP is expected to be relatively small. When
+instead of a colon for the context lines (the \fB-Z\fP option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of \fInumber\fP is expected to be relatively small. When
 \fB-c\fP is used, \fB-B\fP is ignored.
 .TP
 \fB--binary-files=\fP\fIword\fP
@ -351,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
 .TP
 \fB-H\fP, \fB--with-filename\fP
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the \fB-M\fP option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the \fB-M\fP option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
 .TP
 \fB-h\fP, \fB--no-filename\fP
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
+Suppress the output of file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The \fB-Z\fP option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
 .TP
 \fB--heap-limit\fP=\fInumber\fP
 See \fB--match-limit\fP below.
@ -412,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
 \fB-L\fP, \fB--files-without-match\fP
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous \fB-H\fP,
-\fB-h\fP, or \fB-l\fP options.
+output once, on a separate line by default, but if the \fB-Z\fP option is set,
+they are separated by zero bytes instead of newlines. This option overrides any
+previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
 .TP
 \fB-l\fP, \fB--files-with-matches\fP
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the \fB-c\fP (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-\fB-c\fP is a way of suppressing the listing of files with no matches that
+a separate line, but if the \fB-Z\fP option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the \fB-c\fP (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with \fB-c\fP is a way of suppressing the listing of files with no matches that
 occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
 \fB-h\fP, or \fB-L\fP options.
 .TP
@ -511,10 +525,7 @@ counter that is incremented each time around its main processing loop. If the
 value set by \fB--match-limit\fP is reached, an error occurs.
 .sp
 The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
+1024 bytes), the maximum amount of heap memory that may be used for matching.
 .sp
 The \fB--depth-limit\fP option limits the depth of nested backtracking points,
 which indirectly limits the amount of memory that is used. The amount of memory
@ -727,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
 pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the \fB--include\fP or \fB--exclude\fP options.
+.TP
+\fB-Z\fP, \fB--null\fP
+Terminate file names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
 .
 .
 .SH "ENVIRONMENT VARIABLES"
@ -946,7 +963,7 @@ because VMS does not distinguish between exit(0) and exit(1).
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -955,6 +972,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 04 October 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 30 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt
@ -42,13 +42,15 @@ DESCRIPTION

         pcre2grep some-pattern file1 - file3

-       Input files are searched line by  line.  By  default,  each  line  that
+       By default, input files are searched  line  by  line.  Each  line  that
       matches  a  pattern  is  copied to the standard output, and if there is
       more than one file, the file name is output at the start of each  line,
       followed  by  a  colon.  However, there are options that can change how
-       pcre2grep behaves. In particular, the -M option makes  it  possible  to
+       pcre2grep behaves. For example, the -M  option  makes  it  possible  to
       search  for  strings  that  span  line  boundaries. What defines a line
-       boundary is controlled by the -N (--newline) option.
+       boundary is controlled by the -N (--newline) option. The -h and -H  op-
+       tions  control  whether  or not file names are shown, and the -Z option
+       changes the file name terminator to a zero byte.

       The amount of memory used for buffering files that are being scanned is
       controlled  by  parameters  that  can  be  set by the --buffer-size and
@ -149,26 +151,35 @@ OPTIONS
                 the  file  is  reached,  or if the processing buffer size has
                 been set too small. If file names and/or line numbers are be-
                 ing output, a hyphen separator is used instead of a colon for
-                 the context lines. A line containing "--" is  output  between
-                 each  group  of  lines, unless they are in fact contiguous in
-                 the input file. The value of number is expected to  be  rela-
-                 tively small. When -c is used, -A is ignored.
+                 the context lines (the -Z option can be used  to  change  the
+                 file  name terminator to a zero byte). A line containing "--"
+                 is output between each group of lines,  unless  they  are  in
+                 fact contiguous in the input file. The value of number is ex-
+                 pected to be relatively small. When -c is  used,  -A  is  ig-
+                 nored.

       -a, --text
                 Treat  binary  files as text. This is equivalent to --binary-
                 files=text.

+       --allow-lookaround-bsk
+                 PCRE2 now forbids the use of \K in lookarounds by default, in
+                 line  with  Perl.   This  option  causes pcre2grep to set the
+                 PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which  enables  this
+                 somewhat dangerous usage.
+
       -B number, --before-context=number
-                 Output up to number lines of  context  before  each  matching
-                 line.  Fewer  lines  are  output if the previous match or the
-                 start of the file is within number lines, or if the  process-
-                 ing  buffer size has been set too small. If file names and/or
+                 Output  up  to  number  lines of context before each matching
+                 line. Fewer lines are output if the  previous  match  or  the
+                 start  of the file is within number lines, or if the process-
+                 ing buffer size has been set too small. If file names  and/or
                 line numbers are being output, a hyphen separator is used in-
-                 stead  of  a  colon  for the context lines. A line containing
-                 "--" is output between each group of lines, unless  they  are
-                 in  fact contiguous in the input file. The value of number is
-                 expected to be relatively small. When -c is used, -B  is  ig-
-                 nored.
+                 stead of a colon for the context lines (the -Z option can  be
+                 used  to  change  the file name terminator to a zero byte). A
+                 line containing "--" is output between each group  of  lines,
+                 unless  they  are  in  fact contiguous in the input file. The
+                 value of number is expected to be relatively small.  When  -c
+                 is used, -B is ignored.

       --binary-files=word
                 Specify  how binary files are to be processed. If the word is
@ -381,89 +392,94 @@ OPTIONS

       -H, --with-filename
                 Force  the  inclusion of the file name at the start of output
-                 lines when searching a single file. By default, the file name
-                 is not shown in this case.  For matching lines, the file name
-                 is followed by a colon; for context lines, a hyphen separator
-                 is  used.  If  a line number is also being output, it follows
-                 the file name. When the -M option causes a pattern  to  match
-                 more  than  one  line, only the first is preceded by the file
-                 name. This option overrides any previous -h, -l,  or  -L  op-
-                 tions.
+                 lines when searching a single file. The file name is not nor-
+                 mally  shown  in  this case.  By default, for matching lines,
+                 the file name is followed by a colon; for  context  lines,  a
+                 hyphen separator is used. The -Z option can be used to change
+                 the terminator to a zero byte. If a line number is also being
+                 output, it follows the file name. When the -M option causes a
+                 pattern to match more than one line, only the first  is  pre-
+                 ceded  by  the  file name. This option overrides any previous
+                 -h, -l, or -L options.

       -h, --no-filename
                 Suppress the output file names when searching multiple files.
-                 By default, file names are  shown  when  multiple  files  are
-                 searched.  For matching lines, the file name is followed by a
-                 colon; for context lines, a hyphen separator is used.   If  a
-                 line  number  is also being output, it follows the file name.
-                 This option overrides any previous -H, -L, or -l options.
+                 File  names  are  normally  shown  when  multiple  files  are
+                 searched. By default, for matching lines, the  file  name  is
+                 followed by a colon; for context lines, a hyphen separator is
+                 used. The -Z option can be used to change the terminator to a
+                 zero  byte. If a line number is also being output, it follows
+                 the file name.  This option overrides any previous -H, -L, or
+                 -l options.

       --heap-limit=number
                 See --match-limit below.

-       --help    Output a help message, giving brief details  of  the  command
-                 options  and  file type support, and then exit. Anything else
+       --help    Output  a  help  message, giving brief details of the command
+                 options and file type support, and then exit.  Anything  else
                 on the command line is ignored.

-       -I        Ignore  binary  files.  This  is  equivalent   to   --binary-
+       -I        Ignore   binary   files.  This  is  equivalent  to  --binary-
                 files=without-match.

       -i, --ignore-case
                 Ignore upper/lower case distinctions during comparisons.

       --include=pattern
-                 If  any --include patterns are specified, the only files that
+                 If any --include patterns are specified, the only files  that
                 are processed are those whose names match one of the patterns
-                 and  do  not match an --exclude pattern. This option does not
-                 affect directories, but it  applies  to  all  files,  whether
-                 listed  on the command line, obtained from --file-list, or by
-                 scanning a directory. The pattern is a PCRE2 regular  expres-
-                 sion,  and is matched against the final component of the file
-                 name, not the entire path. The -F, -w, and -x options do  not
-                 apply  to this pattern. The option may be given any number of
-                 times. If a file name matches both an --include and an  --ex-
-                 clude  pattern,  it  is excluded.  There is no short form for
+                 and do not match an --exclude pattern. This option  does  not
+                 affect  directories,  but  it  applies  to all files, whether
+                 listed on the command line, obtained from --file-list, or  by
+                 scanning  a directory. The pattern is a PCRE2 regular expres-
+                 sion, and is matched against the final component of the  file
+                 name,  not the entire path. The -F, -w, and -x options do not
+                 apply to this pattern. The option may be given any number  of
+                 times.  If a file name matches both an --include and an --ex-
+                 clude pattern, it is excluded.  There is no  short  form  for
                 this option.

       --include-from=filename
-                 Treat each non-empty line of the file  as  the  data  for  an
+                 Treat  each  non-empty  line  of  the file as the data for an
                 --include option. What constitutes a newline for this purpose
-                 is the operating system's default. The --newline  option  has
+                 is  the  operating system's default. The --newline option has
                 no effect on this option. This option may be given any number
                 of times; all the files are read.

       --include-dir=pattern
-                 If any --include-dir patterns are specified, the only  direc-
-                 tories  that are processed are those whose names match one of
-                 the patterns and do not match an --exclude-dir pattern.  This
-                 applies  to  all  directories,  whether listed on the command
-                 line, obtained from --file-list, or by scanning a parent  di-
-                 rectory.  The  pattern  is a PCRE2 regular expression, and is
-                 matched against the final component of  the  directory  name,
-                 not  the entire path. The -F, -w, and -x options do not apply
+                 If  any --include-dir patterns are specified, the only direc-
+                 tories that are processed are those whose names match one  of
+                 the  patterns and do not match an --exclude-dir pattern. This
+                 applies to all directories, whether  listed  on  the  command
+                 line,  obtained from --file-list, or by scanning a parent di-
+                 rectory. The pattern is a PCRE2 regular  expression,  and  is
+                 matched  against  the  final component of the directory name,
+                 not the entire path. The -F, -w, and -x options do not  apply
                 to this pattern. The option may be given any number of times.
-                 If  a directory matches both --include-dir and --exclude-dir,
+                 If a directory matches both --include-dir and  --exclude-dir,
                 it is excluded. There is no short form for this option.

       -L, --files-without-match
-                 Instead of outputting lines from the files, just  output  the
-                 names  of  the files that do not contain any lines that would
-                 have been output. Each file name is output once, on  a  sepa-
-                 rate  line.  This option overrides any previous -H, -h, or -l
-                 options.
+                 Instead  of  outputting lines from the files, just output the
+                 names of the files that do not contain any lines  that  would
+                 have  been  output. Each file name is output once, on a sepa-
+                 rate line by default, but if the -Z option is set,  they  are
+                 separated  by  zero  bytes  instead  of newlines. This option
+                 overrides any previous -H, -h, or -l options.

       -l, --files-with-matches
                 Instead of outputting lines from the files, just  output  the
                 names of the files containing lines that would have been out-
-                 put. Each file name is  output  once,  on  a  separate  line.
-                 Searching  normally stops as soon as a matching line is found
-                 in a file. However, if the -c (count) option  is  also  used,
-                 matching  continues in order to obtain the correct count, and
-                 those files that have at least one  match  are  listed  along
-                 with their counts. Using this option with -c is a way of sup-
-                 pressing the listing of files with  no  matches  that  occurs
-                 with  -c  on  its own. This option overrides any previous -H,
-                 -h, or -L options.
+                 put. Each file name is output once, on a separate  line,  but
+                 if the -Z option is set, they are separated by zero bytes in-
+                 stead of newlines. Searching normally  stops  as  soon  as  a
+                 matching  line is found in a file. However, if the -c (count)
+                 option is also used, matching continues in  order  to  obtain
+                 the  correct  count,  and  those files that have at least one
+                 match are listed along with their counts. Using  this  option
+                 with  -c is a way of suppressing the listing of files with no
+                 matches that occurs with -c on its own. This option overrides
+                 any previous -H, -h, or -L options.

       --label=name
                 This option supplies a name to be used for the standard input
@ -471,105 +487,102 @@ OPTIONS
                 input)" is used. There is no short form for this option.

       --line-buffered
-                 When this option is given, non-compressed input is  read  and
-                 processed  line by line, and the output is flushed after each
-                 write. By default, input is  read  in  large  chunks,  unless
-                 pcre2grep  can  determine that it is reading from a terminal,
+                 When  this  option is given, non-compressed input is read and
+                 processed line by line, and the output is flushed after  each
+                 write.  By  default,  input  is  read in large chunks, unless
+                 pcre2grep can determine that it is reading from  a  terminal,
                 which is currently possible only in Unix-like environments or
                 Windows. Output to terminal is normally automatically flushed
-                 by the operating system. This option can be useful  when  the
-                 input  or  output  is  attached to a pipe and you do not want
-                 pcre2grep to buffer up large amounts of data.   However,  its
-                 use  will  affect  performance, and the -M (multiline) option
-                 ceases to work. When input is from a compressed .gz  or  .bz2
+                 by  the  operating system. This option can be useful when the
+                 input or output is attached to a pipe and  you  do  not  want
+                 pcre2grep  to  buffer up large amounts of data.  However, its
+                 use will affect performance, and the  -M  (multiline)  option
+                 ceases  to  work. When input is from a compressed .gz or .bz2
                 file, --line-buffered is ignored.

       --line-offsets
-                 Instead  of  showing lines or parts of lines that match, show
+                 Instead of showing lines or parts of lines that  match,  show
                 each match as a line number, the offset from the start of the
-                 line,  and a length. The line number is terminated by a colon
-                 (as usual; see the -n option), and the offset and length  are
-                 separated  by  a  comma.  In  this mode, no context is shown.
-                 That is, the -A, -B, and -C options are ignored. If there  is
-                 more  than  one  match in a line, each of them is shown sepa-
-                 rately. This option  is  mutually  exclusive  with  --output,
+                 line, and a length. The line number is terminated by a  colon
+                 (as  usual; see the -n option), and the offset and length are
+                 separated by a comma. In this  mode,  no  context  is  shown.
+                 That  is, the -A, -B, and -C options are ignored. If there is
+                 more than one match in a line, each of them  is  shown  sepa-
+                 rately.  This  option  is  mutually  exclusive with --output,
                 --file-offsets, and --only-matching.

       --locale=locale-name
-                 This  option specifies a locale to be used for pattern match-
-                 ing. It overrides the value in the LC_ALL or  LC_CTYPE  envi-
-                 ronment  variables.  If no locale is specified, the PCRE2 li-
+                 This option specifies a locale to be used for pattern  match-
+                 ing.  It  overrides the value in the LC_ALL or LC_CTYPE envi-
+                 ronment variables. If no locale is specified, the  PCRE2  li-
                 brary's default (usually the "C" locale) is used. There is no
                 short form for this option.

       -M, --multiline
-                 Allow  patterns to match more than one line. When this option
+                 Allow patterns to match more than one line. When this  option
                 is set, the PCRE2 library is called in "multiline" mode. This
-                 allows  a matched string to extend past the end of a line and
-                 continue on one or more subsequent lines. Patterns used  with
+                 allows a matched string to extend past the end of a line  and
+                 continue  on one or more subsequent lines. Patterns used with
                 -M may usefully contain literal newline characters and inter-
-                 nal occurrences of ^ and $ characters. The output for a  suc-
-                 cessful  match  may  consist of more than one line. The first
-                 line is the line in which the match  started,  and  the  last
-                 line  is  the  line  in which the match ended. If the matched
-                 string ends with a newline sequence, the output ends  at  the
-                 end  of  that  line.   If  -v  is set, none of the lines in a
-                 multi-line match are output. Once a match has  been  handled,
-                 scanning  restarts at the beginning of the line after the one
+                 nal  occurrences of ^ and $ characters. The output for a suc-
+                 cessful match may consist of more than one  line.  The  first
+                 line  is  the  line  in which the match started, and the last
+                 line is the line in which the match  ended.  If  the  matched
+                 string  ends  with a newline sequence, the output ends at the
+                 end of that line.  If -v is set,  none  of  the  lines  in  a
+                 multi-line  match  are output. Once a match has been handled,
+                 scanning restarts at the beginning of the line after the  one
                 in which the match ended.

-                 The newline sequence that separates multiple  lines  must  be
-                 matched  as  part  of  the  pattern. For example, to find the
-                 phrase "regular expression" in a file where  "regular"  might
-                 be  at the end of a line and "expression" at the start of the
+                 The  newline  sequence  that separates multiple lines must be
+                 matched as part of the pattern.  For  example,  to  find  the
+                 phrase  "regular  expression" in a file where "regular" might
+                 be at the end of a line and "expression" at the start of  the
                 next line, you could use this command:

                   pcre2grep -M 'regular\s+expression' <file>

                 The \s escape sequence matches any white space character, in-
-                 cluding  newlines, and is followed by + so as to match trail-
-                 ing white space on the first line as well  as  possibly  han-
+                 cluding newlines, and is followed by + so as to match  trail-
+                 ing  white  space  on the first line as well as possibly han-
                 dling a two-character newline sequence.

-                 There  is a limit to the number of lines that can be matched,
-                 imposed by the way that pcre2grep buffers the input  file  as
-                 it  scans  it.  With  a sufficiently large processing buffer,
+                 There is a limit to the number of lines that can be  matched,
+                 imposed  by  the way that pcre2grep buffers the input file as
+                 it scans it. With a  sufficiently  large  processing  buffer,
                 this should not be a problem, but the -M option does not work
                 when input is read line by line (see --line-buffered.)

       -m number, --max-count=number
-                 Stop  processing after finding number matching lines, or non-
-                 matching lines if -v is also set. Any trailing context  lines
-                 are  output  after  the  final match. In multiline mode, each
-                 multiline match counts as just one line for this purpose.  If
-                 this  limit is reached when reading the standard input from a
+                 Stop processing after finding number matching lines, or  non-
+                 matching  lines if -v is also set. Any trailing context lines
+                 are output after the final match.  In  multiline  mode,  each
+                 multiline  match counts as just one line for this purpose. If
+                 this limit is reached when reading the standard input from  a
                 regular file, the file is left positioned just after the last
-                 matching  line.   If -c is also set, the count that is output
-                 is never greater than number. This option has  no  effect  if
+                 matching line.  If -c is also set, the count that  is  output
+                 is  never  greater  than number. This option has no effect if
                 used with -L, -l, or -q, or when just checking for a match in
                 a binary file.

       --match-limit=number
-                 Processing some regular expression patterns may take  a  very
+                 Processing  some  regular expression patterns may take a very
                 long time to search for all possible matching strings. Others
-                 may require a very large amount of memory.  There  are  three
+                 may  require  a  very large amount of memory. There are three
                 options that set resource limits for matching.

                 The --match-limit option provides a means of limiting comput-
-                 ing resource usage when processing patterns that are not  go-
+                 ing  resource usage when processing patterns that are not go-
                 ing to match, but which have a very large number of possibil-
                 ities in their search trees. The classic example is a pattern
-                 that  uses  nested unlimited repeats. Internally, PCRE2 has a
-                 counter that is incremented each time around  its  main  pro-
-                 cessing  loop.  If the value set by --match-limit is reached,
+                 that uses nested unlimited repeats. Internally, PCRE2  has  a
+                 counter  that  is  incremented each time around its main pro-
+                 cessing loop. If the value set by --match-limit  is  reached,
                 an error occurs.

-                 The --heap-limit option specifies, as a number  of  kibibytes
-                 (units  of 1024 bytes), the amount of heap memory that may be
-                 used for matching. Heap memory is needed only if matching the
-                 pattern  requires a significant number of nested backtracking
-                 points to be remembered. This parameter can be set to zero to
-                 forbid the use of heap memory altogether.
+                 The  --heap-limit  option specifies, as a number of kibibytes
+                 (units of 1024 bytes), the maximum amount of heap memory that
+                 may be used for matching.

                 The  --depth-limit  option  limits  the depth of nested back-
                 tracking points, which indirectly limits the amount of memory
@ -806,6 +819,13 @@ OPTIONS
                 does  not apply to patterns specified by any of the --include
                 or --exclude options.

+       -Z, --null
+                 Terminate file names in the regular output with a  zero  byte
+                 (the  NUL  character)  instead of what would normally appear.
+                 This is useful when file  names  contain  unusual  characters
+                 such  as  colons,  hyphens, or even newlines. The option does
+                 not apply to file names in error messages.
+

 ENVIRONMENT VARIABLES

@ -1010,11 +1030,11 @@ SEE ALSO
 AUTHOR

       Philip Hazel
-       University Computing Service
+       Retired from University Computing Service
       Cambridge, England.


 REVISION

-       Last updated: 04 October 2020
-       Copyright (c) 1997-2020 University of Cambridge.
+       Last updated: 30 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
--- a/doc/pcre2jit.3
+++ b/doc/pcre2jit.3
@ -1,4 +1,4 @@
-.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34"
+.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
@ -29,6 +29,7 @@ platforms:
 .sp
  ARM 32-bit (v5, v7, and Thumb2)
  ARM 64-bit
+  IBM s390x 64-bit
  Intel x86 32-bit and 64-bit
  MIPS 32-bit and 64-bit
  Power PC 32-bit and 64-bit
@ -250,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
 starts another match, that match must use a different JIT stack to the one used
 for currently suspended match(es).
 .P
-In a multithread application, if you do not
-specify a JIT stack, or if you assign or pass back NULL from a callback, that
-is thread-safe, because each thread has its own machine stack. However, if you
-assign or pass back a non-NULL JIT stack, this must be a different stack for
-each thread so that the application is thread-safe.
+In a multithread application, if you do not specify a JIT stack, or if you
+assign or pass back NULL from a callback, that is thread-safe, because each
+thread has its own machine stack. However, if you assign or pass back a
+non-NULL JIT stack, this must be a different stack for each thread so that the
+application is thread-safe.
 .P
 Strictly speaking, even more is allowed. You can assign the same non-NULL stack
 to a match context that is used by any number of patterns, as long as they are
@ -266,7 +267,7 @@ inefficient solution, and not recommended.
 This is a suggestion for how a multithreaded program that needs to set up
 non-default JIT stacks might operate:
 .sp
-  During thread initalization
+  During thread initialization
    thread_local_var = pcre2_jit_stack_create(...)
 .sp
  During thread exit
@ -354,8 +355,8 @@ out this complicated API.
 .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
 .fi
 .P
-The JIT executable allocator does not free all memory when it is possible.
-It expects new allocations, and keeps some free memory around to improve
+The JIT executable allocator does not free all memory when it is possible. It
+expects new allocations, and keeps some free memory around to improve
 allocation speed. However, in low memory conditions, it might be better to free
 all possible memory. You can cause this to happen by calling
 pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
@ -415,10 +416,10 @@ that was not compiled.
 .P
 When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
 number of other sanity checks are performed on the arguments. For example, if
-the subject pointer is NULL, an immediate error is given. Also, unless
-PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
-interests of speed, these checks do not happen on the JIT fast path, and if
-invalid data is passed, the result is undefined.
+the subject pointer is NULL but the length is non-zero, an immediate error is
+given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
+for validity. In the interests of speed, these checks do not happen on the JIT
+fast path, and if invalid data is passed, the result is undefined.
 .P
 Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
 speedups of more than 10%.
@ -444,6 +445,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 23 May 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 30 November 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2limits.3
+++ b/doc/pcre2limits.3
@ -1,4 +1,4 @@
-.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "SIZE AND OTHER LIMITATIONS"
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
 .P
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
+.P
+The maximum amount of heap memory used for matching is controlled by the heap 
+limit, which can be set in a pattern or in a match context. The default is a 
+very large number, effectively unlimited.
 .
 .
 .SH AUTHOR
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -67,6 +71,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 02 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 26 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2matching.3
+++ b/doc/pcre2matching.3
@ -1,4 +1,4 @@
-.TH PCRE2MATCHING 3 "23 May 2019" "PCRE2 10.34"
+.TH PCRE2MATCHING 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 MATCHING ALGORITHMS"
@ -61,8 +61,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
 If a leaf node is reached, a matching string has been found, and at that point
 the algorithm stops. Thus, if there is more than one possible match, this
 algorithm returns the first one that it finds. Whether this is the shortest,
-the longest, or some intermediate length depends on the way the greedy and
-ungreedy repetition quantifiers are specified in the pattern.
+the longest, or some intermediate length depends on the way the alternations
+and the greedy or ungreedy repetition quantifiers are specified in the
+pattern.
 .P
 Because it ends up with a single path through the tree, it is relatively
 straightforward for this algorithm to keep track of the substrings that are
@ -91,10 +92,15 @@ no more unterminated paths. At this point, terminated paths represent the
 different matching possibilities (if there are none, the match has failed).
 Thus, if there is more than one possible match, this algorithm finds all of
 them, and in particular, it finds the longest. The matches are returned in
-decreasing order of length. There is an option to stop the algorithm after the
-first match (which is necessarily the shortest) is found.
+the output vector in decreasing order of length. There is an option to stop the
+algorithm after the first match (which is necessarily the shortest) is found.
 .P
-Note that all the matches that are found start at the same point in the
+Note that the size of vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
+data block is therefore not advisable when doing DFA matching.
+.P
+Note also that all the matches that are found start at the same point in the
 subject. If the pattern
 .sp
  cat(er(pillar)?)?
@ -165,19 +171,13 @@ supported by \fBpcre2_dfa_match()\fP.
 .SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
 .rs
 .sp
-Using the alternative matching algorithm provides the following advantages:
+The main advantage of the alternative algorithm is that all possible matches
+(at a single point in the subject) are automatically found, and in particular,
+the longest match is found. To find more than one match at the same point using
+the standard algorithm, you have to do kludgy things with callouts.
 .P
-1. All possible matches (at a single point in the subject) are automatically
-found, and in particular, the longest match is found. To find more than one
-match using the standard algorithm, you have to do kludgy things with
-callouts.
-.P
-2. Because the alternative algorithm scans the subject string just once, and
-never needs to backtrack (except for lookbehinds), it is possible to pass very
-long subject strings to the matching function in several pieces, checking for
-partial matching each time. Although it is also possible to do multi-segment
-matching using the standard algorithm, by retaining partially matched
-substrings, it is more complicated. The
+Partial matching is possible with this algorithm, though it has some
+limitations. The
 .\" HREF
 \fBpcre2partial\fP
 .\"
@ -199,6 +199,8 @@ invalid UTF string are not supported.
 .P
 3. Although atomic groups are supported, their use does not provide the
 performance advantage that it does for the standard algorithm.
+.P
+4. JIT optimization is not supported.
 .
 .
 .SH AUTHOR
@ -206,7 +208,7 @@ performance advantage that it does for the standard algorithm.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -215,6 +217,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 23 May 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 28 August 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "06 October 2020" "PCRE2 10.35"
+.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -509,7 +509,6 @@ for themselves. For example, outside a character class:
 .\" JOIN
  \e377   might be a backreference, otherwise
            the value 255 (decimal)
-.\" JOIN
  \e81    is always a backreference
 .sp
 Note that octal values of 100 or greater that are specified using this syntax
@ -741,7 +740,7 @@ Unicode support is not needed for these characters to be recognized.
 .P
 It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
 complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
-at compile time. (BSR is an abbrevation for "backslash R".) This can be made
+at compile time. (BSR is an abbreviation for "backslash R".) This can be made
 the default when PCRE2 is built; if this is the case, the other behaviour can
 be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
 these settings by starting a pattern string with one of the following
@ -773,195 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
 sequences are of course limited to testing characters whose code points are
 less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
 greater than 0x10ffff (the Unicode limit) may be encountered. These are all
-treated as being in the Unknown script and with an unassigned type. The extra
-escape sequences are:
+treated as being in the Unknown script and with an unassigned type.
+.P
+Matching characters by Unicode property is not fast, because PCRE2 has to do a
+multistage table lookup in order to find a character's property. That is why
+the traditional escape sequences such as \ed and \ew do not use Unicode
+properties in PCRE2 by default, though you can make them do so by setting the
+PCRE2_UCP option or by starting the pattern with (*UCP).
+.P
+The extra escape sequences that provide property support are:
 .sp
  \ep{\fIxx\fP}   a character with the \fIxx\fP property
  \eP{\fIxx\fP}   a character without the \fIxx\fP property
  \eX       a Unicode extended grapheme cluster
 .sp
-The property names represented by \fIxx\fP above are case-sensitive. There is
-support for Unicode script names, Unicode general category properties, "Any",
-which matches any character (including newline), and some special PCRE2
-properties (described in the
+The property names represented by \fIxx\fP above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
 .\" HTML <a href="#extraprops">
 .\" </a>
-next section).
+below).
 .\"
-Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
-Note that \eP{Any} does not match any characters, so always causes a match
-failure.
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \eP{Any} does not match any characters, so always causes a
+match failure.
+.
+.
+.
+.SS "Script properties for \ep and \eP"
+.rs
+.sp
+There are three different syntax forms for matching a script. Each Unicode
+character has a basic script and, optionally, a list of other scripts ("Script
+Extensions") with which it is commonly used. Using the Adlam script as an
+example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
+\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
+extensions list. The full names "script" and "script extensions" for the
+property types are recognized, and an equals sign is an alternative to the
+colon. If a script name is given without a property type, for example,
+\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
+interpretation at release 5.26 and PCRE2 changed at release 10.40.
 .P
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-.sp
-  \ep{Greek}
-  \eP{Han}
-.sp
 Unassigned characters (and in non-UTF 32-bit mode, characters with code points
 greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
 part of an identified script are lumped together as "Common". The current list
-of scripts is:
-.P
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Unknown,
-Vai,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
-.P
+of recognized script names and their 4-character abbreviations can be obtained
+by running this command:
+.sp
+  pcre2test -LS
+.sp
+.
+.
+.
+.SS "The general category property for \ep and \eP"
+.rs
+.sp
 Each character has exactly one Unicode general category property, specified by
 a two-letter abbreviation. For compatibility with Perl, negation can be
 specified by including a circumflex between the opening brace and the property
@ -1021,9 +889,9 @@ The following general category property codes are supported:
  Zp    Paragraph separator
  Zs    Space separator
 .sp
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
+The special property LC, which has the synonym L&, is also supported: it
+matches a character that has the Lu, Ll, or Lt property, in other words, a
+letter that is not classified as a modifier or "other".
 .P
 The Cs (Surrogate) property applies only to characters whose code points are in
 the range U+D800 to U+DFFF. These characters are no different to any other
@ -1047,12 +915,53 @@ Unicode table.
 Specifying caseless matching does not affect these escape sequences. For
 example, \ep{Lu} always matches only upper case letters. This is different from
 the behaviour of current versions of Perl.
-.P
-Matching characters by Unicode property is not fast, because PCRE2 has to do a
-multistage table lookup in order to find a character's property. That is why
-the traditional escape sequences such as \ed and \ew do not use Unicode
-properties in PCRE2 by default, though you can make them do so by setting the
-PCRE2_UCP option or by starting the pattern with (*UCP).
+.
+.
+.SS "Binary (yes/no) properties for \ep and \eP"
+.rs
+.sp
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\ep and \eP, along with their abbreviations, by running this command:
+.sp
+  pcre2test -LP
+.sp
+.
+.
+.SS "The Bidi_Class property for \ep and \eP"
+.rs
+.sp
+  \ep{Bidi_Class:<class>}   matches a character with the given class
+  \ep{BC:<class>}           matches a character with the given class
+.sp
+The recognized classes are:
+.sp
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
+.sp
+An equals sign may be used instead of a colon. The class names are
+case-insensitive; only the short names listed above are recognized.
 .
 .
 .SS Extended grapheme clusters
@ -1082,7 +991,7 @@ additional characters according to the following rules for ending a cluster:
 3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
 are of five types: L, V, T, LV, and LVT. An L character may be followed by an
 L, V, LV, or LVT character; an LV or V character may be followed by a V or T
-character; an LVT or T character may be follwed only by a T character.
+character; an LVT or T character may be followed only by a T character.
 .P
 4. Do not end before extending characters or spacing marks or the "zero-width
 joiner" character. Characters with the "mark" property always have the
@ -1168,9 +1077,11 @@ For example, when the pattern
 .sp
 matches "foobar", the first substring is still set to "foo".
 .P
-Perl used to document that the use of \eK within lookaround assertions is "not
-well defined", but from version 5.32.0 Perl does not support this usage at all.
-In PCRE2, \eK is acted upon when it occurs inside positive assertions, but is
+From version 5.32.0 Perl forbids the use of \eK in lookaround assertions. From
+release 10.38 PCRE2 also forbids this by default. However, the
+PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
+\fBpcre2_compile()\fP to re-enable the previous behaviour. When this option is
+set, \eK is acted upon when it occurs inside positive assertions, but is
 ignored in negative assertions. Note that when a pattern such as (?=ab\eK)
 matches, the reported start of the match can be greater than the end of the
 match. Using \eK in a lookbehind assertion at the start of a pattern can also
@ -1329,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
 .sp
 Outside a character class, a dot in the pattern matches any one character in
 the subject string except (by default) a character that signifies the end of a
-line.
+line. One or more characters may be specified as line terminators (see
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above).
 .P
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
+Dot never matches a single line-ending character. When the two-character
+sequence CRLF is the only line ending, dot does not match CR if it is
+immediately followed by LF, but otherwise it matches all characters (including
+isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
+of CR or LF match dot. When all Unicode line endings are being recognized, dot
+does not match CR or LF or any of the other line ending characters.
 .P
 The behaviour of dot with regard to newlines can be changed. If the
 PCRE2_DOTALL option is set, a dot matches any one character, without exception.
@ -2179,10 +2095,10 @@ be easier to remember:
 .sp
  (*atomic:\ed+)foo
 .sp
-This kind of parenthesized group "locks up" the  part of the pattern it
-contains once it has matched, and a failure further into the pattern is
-prevented from backtracking into it. Backtracking past it to previous items,
-however, works as normal.
+This kind of parenthesized group "locks up" the part of the pattern it contains
+once it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
 .P
 An alternative description is that a group of this type matches exactly the
 string of characters that an identical standalone pattern would match, if
@ -2928,7 +2844,7 @@ breaks):
  (?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
  \eb (?&byte) (\e.(?&byte)){3} \eb
 .sp
-The first part of the pattern is a DEFINE group inside which a another group
+The first part of the pattern is a DEFINE group inside which another group
 named "byte" is defined. This matches an individual component of an IPv4
 address (a number less than 256). When matching takes place, this part of the
 pattern is skipped because DEFINE acts like a false condition. The rest of the
@ -3658,7 +3574,7 @@ successful match if there is a later mismatch. Consider:
 .sp
 If the subject is "aaaac...", after the first match attempt fails (starting at
 the first character in the string), the starting point skips on to start the
-next attempt at "c". Note that a possessive quantifer does not have the same
+next attempt at "c". Note that a possessive quantifier does not have the same
 effect as this example; although it would suppress backtracking during the
 first match attempt, the second attempt would start at the second character
 instead of skipping on to "c".
@ -3889,7 +3805,7 @@ there is a backtrack at the outer level.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -3898,6 +3814,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 06 October 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 12 January 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2perform.3
+++ b/doc/pcre2perform.3
@ -1,4 +1,4 @@
-.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 PERFORMANCE"
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code. 
+.P
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+.P
+Until release 10.41, an initial 20KiB frames vector was allocated on the system 
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to \fBpcre2_match()\fP. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+.P
+The size of the initial block is the larger of 20KiB or ten times the pattern's 
+frame size, unless the heap limit is less than this, in which case the heap 
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is 
+checked only when a new block is to be allocated. Reducing the heap limit 
+between calls to \fBpcre2_match()\fP with the same match data block does not 
+affect the saved block.
 .P
 In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
 function calls, but only for processing atomic groups, lookaround assertions,
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -239,6 +255,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 03 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2serialize.3
+++ b/doc/pcre2serialize.3
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
 .nf
 .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
-.B "  int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
+.B "  int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
 .B "  pcre2_general_context *\fIgcontext\fP);"
 .sp
-.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
-.B "  int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
+.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
+.B "  int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
 .B "  PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
 .sp
 .B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
 .sp
  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 .sp
@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
 \fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
 stream is no longer needed and can be discarded.
 .sp
-  int32_t number_of_codes;
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = <serialized data>;
  int32_t number_of_codes =
--- a/doc/pcre2syntax.3
+++ b/doc/pcre2syntax.3
@ -1,4 +1,4 @@
-.TH PCRE2SYNTAX 3 "28 December 2019" "PCRE2 10.35"
+.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range
 128-255. If the PCRE2_UCP option is set, the behaviour of these escape
 sequences is changed to use Unicode properties and they match many more
 characters.
+.P
+Property descriptions in \ep and \eP are matched caselessly; hyphens,
+underscores, and white space are ignored, in accordance with Unicode's "loose
+matching" rules.
 .
 .
 .SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP"
@ -120,6 +124,7 @@ characters.
  Lo         Other letter
  Lt         Title case letter
  Lu         Upper case letter
+  Lc         Ll, Lu, or Lt
  L&         Ll, Lu, or Lt
 .sp
  M          Mark
@ -167,165 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set
 at release 5.18.
 .
 .
-.SH "SCRIPT NAMES FOR \ep AND \eP"
+.SH "BINARY PROPERTIES FOR \ep AND \eP"
 .rs
 .sp
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Chorasmian,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dives_Akuru,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Elymaic,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khitan_Small_Script,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-Nandinagari,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Nyakeng_Puachue_Hmong,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Vai,
-Wancho,
-Warang_Citi,
-Yezidi,
-Yi,
-Zanabazar_Square.
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\ep and \eP, along with their abbreviations, by running this command:
+.sp
+  pcre2test -LP
+.
+.
+.
+.SH "SCRIPT MATCHING WITH \ep AND \eP"
+.rs
+.sp
+Many script names and their 4-letter abbreviations are recognized in
+\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
+course). You can obtain a list of these scripts by running this command:
+.sp
+  pcre2test -LS
+.
+.
+.
+.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP"
+.rs
+.sp
+  \ep{Bidi_Class:<class>}   matches a character with the given class
+  \ep{BC:<class>}           matches a character with the given class
+.sp
+The recognized classes are:
+.sp
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          white space
 .
 .
 .SH "CHARACTER CLASSES"
@ -401,6 +300,9 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
 .sp
  \eK          set reported start of match
 .sp
+From release 10.38 \eK is not permitted by default in lookaround assertions,
+for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option is set, the previous behaviour is re-enabled. When this option is set,
 \eK is honoured in positive assertions, but ignored in negative ones.
 .
 .
@ -667,7 +569,7 @@ delimiter }. To encode the ending delimiter within the string, double it.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -676,6 +578,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 28 December 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 12 January 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "28 April 2021" "PCRE 10.37"
+.TH PCRE2TEST 1 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@ -27,12 +27,7 @@ each match attempt. Modifiers on external or internal command lines, the
 patterns, and the subject lines specify PCRE2 function options, control how the
 subject is processed, and what output is produced.
 .P
-As the original fairly simple PCRE library evolved, it acquired many different
-features, and as a result, the original \fBpcretest\fP program ended up with a
-lot of options in a messy, arcane syntax for testing all the features. The
-move to the new PCRE2 API provided an opportunity to re-implement the test
-program as \fBpcre2test\fP, with a cleaner modifier syntax. Nevertheless, there
-are still many obscure modifiers, some of which are specifically designed for
+There are many obscure modifiers, some of which are specifically designed for
 use in conjunction with the test script and data files that are distributed as
 part of PCRE2. All the modifiers are documented here, some without much
 justification, but many of them are unlikely to be of use except when testing
@ -52,7 +47,7 @@ format before being passed to the library functions. Results are converted back
 to 8-bit code units for output.
 .P
 In the rest of this document, the names of library functions and structures
-are given in generic form, for example, \fBpcre_compile()\fP. The actual
+are given in generic form, for example, \fBpcre2_compile()\fP. The actual
 names used in the libraries have a suffix _8, _16, or _32, as appropriate.
 .
 .
@ -61,10 +56,10 @@ names used in the libraries have a suffix _8, _16, or _32, as appropriate.
 .rs
 .sp
 Input to \fBpcre2test\fP is processed line by line, either by calling the C
-library's \fBfgets()\fP function, or via the \fBlibreadline\fP library. In some
-Windows environments character 26 (hex 1A) causes an immediate end of file, and
-no further data is read, so this character should be avoided unless you really
-want that action.
+library's \fBfgets()\fP function, or via the \fBlibreadline\fP or \fBlibedit\fP
+library. In some Windows environments character 26 (hex 1A) causes an immediate
+end of file, and no further data is read, so this character should be avoided
+unless you really want that action.
 .P
 The input is processed using C's string functions, so must not
 contain binary zeros, even though in Unix-like environments, \fBfgets()\fP
@ -216,7 +211,17 @@ available, and the use of JIT for matching is verified.
 \fB-LM\fP
 List modifiers: write a list of available pattern and subject modifiers to the
 standard output, then exit with zero exit code. All other options are ignored.
-If both -C and -LM are present, whichever is first is recognized.
+If both -C and any -Lx options are present, whichever is first is recognized.
+.TP 10
+\fB-LP\fP
+List properties: write a list of recognized Unicode properties to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
+.TP 10
+\fB-LS\fP
+List scripts: write a list of recognized Unicode script names to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
 .TP 10
 \fB-pattern\fP \fImodifier-list\fP
 Behave as if each pattern line contains the given modifiers.
@ -443,15 +448,17 @@ excluding pattern meta-characters):
 .sp
 This is interpreted as the pattern's delimiter. A regular expression may be
 continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it with a backslash, for example
+included within it. It is possible to include the delimiter as a literal within
+the pattern by escaping it with a backslash, for example
 .sp
  /abc\e/def/
 .sp
 If you do this, the escape and the delimiter form part of the pattern, but
-since the delimiters are all non-alphanumeric, this does not affect its
-interpretation. If the terminating delimiter is immediately followed by a
-backslash, for example,
+since the delimiters are all non-alphanumeric, the inclusion of the backslash
+does not affect the pattern's interpretation. Note, however, that this trick
+does not work within \eQ...\eE literal bracketing because the backslash will
+itself be interpreted as a literal. If the terminating delimiter is immediately
+followed by a backslash, for example,
 .sp
  /abc/\e
 .sp
@ -470,11 +477,11 @@ A pattern can be followed by a modifier list (details below).
 .SH "SUBJECT LINE SYNTAX"
 .rs
 .sp
-Before each subject line is passed to \fBpcre2_match()\fP or
-\fBpcre2_dfa_match()\fP, leading and trailing white space is removed, and the
-line is scanned for backslash escapes, unless the \fBsubject_literal\fP
-modifier was set for the pattern. The following provide a means of encoding
-non-printing characters in a visible way:
+Before each subject line is passed to \fBpcre2_match()\fP,
+\fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP, leading and trailing white
+space is removed, and the line is scanned for backslash escapes, unless the
+\fBsubject_literal\fP modifier was set for the pattern. The following provide a
+means of encoding non-printing characters in a visible way:
 .sp
  \ea         alarm (BEL, \ex07)
  \eb         backspace (\ex08)
@ -570,6 +577,7 @@ way \fBpcre2_compile()\fP behaves. See
 for a description of the effects of these options.
 .sp
      allow_empty_class         set PCRE2_ALLOW_EMPTY_CLASS
+      allow_lookaround_bsk      set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
      allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
      alt_bsux                  set PCRE2_ALT_BSUX
      alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
@ -1198,7 +1206,8 @@ pattern, but can be overridden by modifiers on the subject.
      copy=<number or name>      copy captured substring
      depth_limit=<n>            set a depth limit
      dfa                        use \fBpcre2_dfa_match()\fP
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
      get=<number or name>       extract captured substring
      getall                     extract all captured substrings
  /g  global                     global matching
@ -1208,6 +1217,8 @@ pattern, but can be overridden by modifiers on the subject.
      match_limit=<n>            set a match limit
      memory                     show heap memory usage
      null_context               match with a NULL context
+      null_replacement           substitute with NULL replacement
+      null_subject               match with NULL subject
      offset=<n>                 set starting offset
      offset_limit=<n>           set offset limit
      ovector=<n>                set size of output vector
@ -1518,7 +1529,7 @@ value that was set on the pattern.
 .sp
 The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
 the appropriate limits in the match context. These values are ignored when the
-\fBfind_limits\fP modifier is specified.
+\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
 .
 .
 .SS "Finding minimum limits"
@ -1528,8 +1539,12 @@ If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
 calls the relevant matching function several times, setting different values in
 the match context via \fBpcre2_set_heap_limit()\fP,
 \fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 .P
 When using this modifier, the pattern should not contain any limit settings
 such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
@ -1553,9 +1568,7 @@ and non-recursive, to the internal matching function, thus controlling the
 overall amount of computing resource that is used.
 .P
 For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 .
 .
 .SS "Showing MARK names"
@ -1574,12 +1587,10 @@ is added to the non-match message.
 .sp
 The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the \fBmemory\fP modifier never has any effect. For this modifier to work, the
+\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
+is used only when a match requires more internal workspace than the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 \fBnull_context\fP modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 .
@ -1631,7 +1642,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of
 passing the replacement string as zero-terminated.
 .
 .
-.SS "Passing a NULL context"
+.SS "Passing a NULL context, subject, or replacement"
 .rs
 .sp
 Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
@ -1639,7 +1650,12 @@ Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
 If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
+\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
+modifiers.
+.P
+Similarly, for testing purposes, if the \fBnull_subject\fP or
+\fBnull_replacement\fP modifier is set, the subject or replacement string
+pointers are passed as NULL, respectively, to the relevant functions.
 .
 .
 .SH "THE ALTERNATIVE MATCHING FUNCTION"
@ -2096,7 +2112,7 @@ on the stack.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -2105,6 +2121,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 28 April 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2test.txt
+++ b/doc/pcre2test.txt
--- a/doc/pcre2unicode.3
+++ b/doc/pcre2unicode.3
@ -1,4 +1,4 @@
-.TH PCRE2UNICODE 3 "23 February 2020" "PCRE2 10.35"
+.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40"
 .SH NAME
 PCRE - Perl-compatible regular expressions (revised API)
 .SH "UNICODE AND UTF SUPPORT"
@ -40,10 +40,11 @@ handled, as documented below.
 .sp
 When PCRE2 is built with Unicode support, the escape sequences \ep{..},
 \eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting.
-The Unicode properties that can be tested are limited to the general category
-properties such as Lu for an upper case letter or Nd for a decimal number, the
-Unicode script names such as Arabic or Han, and the derived properties Any and
-L&. Full lists are given in the
+The Unicode properties that can be tested are a subset of those that Perl
+supports. Currently they are limited to the general category properties such as
+Lu for an upper case letter or Nd for a decimal number, the Unicode script
+names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
+properties Any and LC (synonym L&). Full lists are given in the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -51,10 +52,10 @@ and
 .\" HREF
 \fBpcre2syntax\fP
 .\"
-documentation. Only the short names for properties are supported. For example,
-\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE2 does not support this.
+documentation. In general, only the short names for properties are supported.
+For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not
+supported. Furthermore, in Perl, many properties may optionally be prefixed by
+"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
 .
 .
 .SH "WIDE CHARACTERS AND UTF MODES"
@ -448,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -457,6 +458,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 23 February 2020
-Copyright (c) 1997-2020 University of Cambridge.
+Last updated: 22 December 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/index.md
+++ b/index.md
@ -0,0 +1,56 @@
+# PCRE2 - Perl-Compatible Regular Expressions
+
+The PCRE2 library is a set of C functions that implement regular expression
+pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
+own native API, as well as a set of wrapper functions that correspond to the
+POSIX regular expression API. The PCRE2 library is free, even for building 
+proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
+or 32-bit code units, in either literal or UTF encoding.
+
+PCRE2 was first released in 2015 to replace the API in the original PCRE 
+library, which is now obsolete and no longer maintained. As well as a more
+flexible API, the code of PCRE2 has been much improved since the fork.
+ 
+## Download
+
+As well as downloading from the 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
+or the older, unmaintained PCRE1 library from an 
+[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
+
+You can check out the PCRE2 source code via Git or Subversion:
+
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git
+
+## Contributed Ports
+
+If you just need the command-line PCRE2 tools on Windows, precompiled binary
+versions are available at this 
+[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
+
+A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
+default character encoding, can be found at 
+[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
+
+## Documentation
+
+You can read the PCRE2 documentation 
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
+
+Comparisons to Perl's regular expression semantics can be found in the
+community authored Wikipedia entry for PCRE.
+
+There is a curated summary of changes for each PCRE release, copies of
+documentation from older releases, and other useful information from the third
+party authored 
+[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
+
+## Contact
+
+To report a problem with the PCRE2 library, or to make a feature request, please
+use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
+PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
+announcements will be made. You can browse the 
+[list archives](https://groups.google.com/g/pcre2-dev).
+
--- a/maint/GenerateCommon.py
+++ b/maint/GenerateCommon.py
@ -0,0 +1,355 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This file is a Python module containing common lists and functions for the
+# GenerateXXX scripts that create various .c and .h files from Unicode data
+# files. It was created as part of a re-organization of these scripts in
+# December 2021.
+
+
+import re
+
+
+# ---------------------------------------------------------------------------
+#                             DATA LISTS
+# ---------------------------------------------------------------------------
+
+# BIDI classes in the DerivedBidiClass.txt file, with comments.
+
+bidi_classes = [
+  'AL',  'Arabic letter',
+  'AN',  'Arabic number',
+  'B',   'Paragraph separator',
+  'BN',  'Boundary neutral',
+  'CS',  'Common separator',
+  'EN',  'European number',
+  'ES',  'European separator',
+  'ET',  'European terminator',
+  'FSI', 'First strong isolate',
+  'L',   'Left to right',
+  'LRE', 'Left to right embedding',
+  'LRI', 'Left to right isolate',
+  'LRO', 'Left to right override',
+  'NSM', 'Non-spacing mark',
+  'ON',  'Other neutral',
+  'PDF', 'Pop directional format',
+  'PDI', 'Pop directional isolate',
+  'R',   'Right to left',
+  'RLE', 'Right to left embedding',
+  'RLI', 'Right to left isolate',
+  'RLO', 'Right to left override',
+  'S',   'Segment separator',
+  'WS',  'White space'
+  ]
+
+# Particular category property names, with comments. NOTE: If ever this list
+# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
+# must be edited to keep in step.
+
+category_names = [
+  'Cc', 'Control',
+  'Cf', 'Format',
+  'Cn', 'Unassigned',
+  'Co', 'Private use',
+  'Cs', 'Surrogate',
+  'Ll', 'Lower case letter',
+  'Lm', 'Modifier letter',
+  'Lo', 'Other letter',
+  'Lt', 'Title case letter',
+  'Lu', 'Upper case letter',
+  'Mc', 'Spacing mark',
+  'Me', 'Enclosing mark',
+  'Mn', 'Non-spacing mark',
+  'Nd', 'Decimal number',
+  'Nl', 'Letter number',
+  'No', 'Other number',
+  'Pc', 'Connector punctuation',
+  'Pd', 'Dash punctuation',
+  'Pe', 'Close punctuation',
+  'Pf', 'Final punctuation',
+  'Pi', 'Initial punctuation',
+  'Po', 'Other punctuation',
+  'Ps', 'Open punctuation',
+  'Sc', 'Currency symbol',
+  'Sk', 'Modifier symbol',
+  'Sm', 'Mathematical symbol',
+  'So', 'Other symbol',
+  'Zl', 'Line separator',
+  'Zp', 'Paragraph separator',
+  'Zs', 'Space separator'
+  ]
+
+# The Extended_Pictographic property is not found in the file where all the
+# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
+# file, but we list it here so that the name has the correct index value.
+
+break_properties = [
+  'CR',                    ' 0',
+  'LF',                    ' 1',
+  'Control',               ' 2',
+  'Extend',                ' 3',
+  'Prepend',               ' 4',
+  'SpacingMark',           ' 5',
+  'L',                     ' 6 Hangul syllable type L',
+  'V',                     ' 7 Hangul syllable type V',
+  'T',                     ' 8 Hangul syllable type T',
+  'LV',                    ' 9 Hangul syllable type LV',
+  'LVT',                   '10 Hangul syllable type LVT',
+  'Regional_Indicator',    '11',
+  'Other',                 '12',
+  'ZWJ',                   '13',
+  'Extended_Pictographic', '14'
+  ]
+
+# List of files from which the names of Boolean properties are obtained, along
+# with a list of regex patterns for properties to be ignored, and a list of
+# extra pattern names to add.
+
+bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
+bool_propsignore = [r'^Other_', r'^Hyphen$']
+bool_propsextras = ['ASCII', 'Bidi_Mirrored']
+
+
+# ---------------------------------------------------------------------------
+#                   GET BOOLEAN PROPERTY NAMES
+# ---------------------------------------------------------------------------
+
+# Get a list of Boolean property names from a number of files.
+
+def getbpropslist():
+  """Return a sorted list of Boolean property names.
+
+  Each file named in bool_propsfiles is read from the Unicode.tables
+  directory. After '#' comments are removed, the second ';'-separated field
+  of each line is taken as a property name. A name is recorded when it
+  differs from the name on the previous data line and matches none of the
+  patterns in bool_propsignore. The names in bool_propsextras are then added
+  and the result is sorted.
+
+  If a file cannot be opened, a message is printed and the script exits with
+  status 1.
+  """
+  # This function is called while the module is still being loaded, before
+  # the module-level "import sys" further down has run, so import it locally;
+  # otherwise the error path below would raise NameError instead of exiting.
+  import sys
+
+  bplist = []
+  bplast = ""
+
+  for filename in bool_propsfiles:
+    path = 'Unicode.tables/' + filename
+    try:
+      file = open(path, 'r')
+    except IOError:
+      print(f"** Couldn't open {path}\n")
+      sys.exit(1)
+
+    # The with-statement closes the file even if reading it raises.
+    with file:
+      for line in file:
+        line = re.sub(r'#.*', '', line)
+        data = list(map(str.strip, line.split(';')))
+        if len(data) <= 1 or data[1] == bplast:
+          continue
+        # Remember the name even when it is ignored, so that a run of
+        # consecutive lines for the same property is handled only once.
+        bplast = data[1]
+        if not any(re.match(pat, bplast) for pat in bool_propsignore):
+          bplist.append(bplast)
+
+  bplist.extend(bool_propsextras)
+  bplist.sort()
+  return bplist
+
+bool_properties = getbpropslist()
+bool_props_list_item_size = (len(bool_properties) + 31) // 32
+
+
+
+# ---------------------------------------------------------------------------
+#                  COLLECTING PROPERTY NAMES AND ALIASES
+# ---------------------------------------------------------------------------
+
+script_names = ['Unknown']
+abbreviations = {}
+
+def collect_property_names():
+  """Fill in script_names and abbreviations from the Unicode data files.
+
+  Scripts.txt supplies the script names: a name is appended to script_names
+  each time it differs from the name on the previous matching line.
+  PropertyValueAliases.txt ("sc" entries) and PropertyAliases.txt (entries
+  whose second field is a known Boolean property) supply the tuples of
+  alternative names that are stored in the abbreviations dictionary.
+  """
+  global script_names
+  global abbreviations
+
+  script_line_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
+
+  previous_name = ""
+  with open("Unicode.tables/Scripts.txt") as f:
+    for line in f:
+      found = script_line_re.match(line)
+      if found is not None and found.group(1) != previous_name:
+        previous_name = found.group(1)
+        script_names.append(previous_name)
+
+  # Sometimes there is a comment in the line, so splitting around the
+  # semicolons is not enough; a regex picks out the fields instead.
+  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
+
+  with open("Unicode.tables/PropertyValueAliases.txt") as f:
+    for line in f:
+      found = value_alias_re.match(line)
+      if found is None:
+        continue
+
+      prop, second, third, fourth = found.groups()
+      if prop != "sc":
+        continue
+
+      # The third field is the dictionary key. The value holds the second
+      # field (unless it is the same as the key) and the optional fourth one.
+      if second == third:
+        abbreviations[third] = ()
+      elif fourth is None:
+        abbreviations[third] = (second,)
+      else:
+        abbreviations[third] = (second, fourth)
+
+  # Boolean property abbreviations are collected into the same dictionary.
+
+  bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
+  with open("Unicode.tables/PropertyAliases.txt") as f:
+    for line in f:
+      found = bin_alias_re.match(line)
+      if found is None:
+        continue
+
+      first, second, third = found.groups()
+      if second in bool_properties:
+        abbreviations[second] = (first,) if third is None else (first, third)
+
+collect_property_names()
+
+
+
+# ---------------------------------------------------------------------------
+#                      REORDERING SCRIPT NAMES
+# ---------------------------------------------------------------------------
+
+script_abbrevs = []
+
+def reorder_scripts():
+  """Reorder script_names and script_abbrevs in step with each other.
+
+  script_abbrevs is first extended with one entry per script name: the first
+  item of that name's tuple in abbreviations, or the name itself if the tuple
+  is empty. Both lists are then rearranged so that scripts whose abbreviation
+  appears in ScriptExtensions.txt come first, followed by all the others.
+  Relative order within each of the two groups is kept.
+  """
+  global script_names
+  global script_abbrevs
+  global abbreviations
+
+  for name in script_names:
+    alternatives = abbreviations[name]
+    script_abbrevs.append(alternatives[0] if len(alternatives) != 0 else name)
+
+  # Gather every abbreviation mentioned in the space-separated third column
+  # of ScriptExtensions.txt.
+  in_extensions = set()
+  with open("Unicode.tables/ScriptExtensions.txt") as f:
+    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
+
+    for line in f:
+      found = names_re.match(line)
+      if found is not None:
+        in_extensions.update(found.group(1).split(" "))
+
+  # Work out the new order as a list of old indexes: extension scripts first.
+  positions = range(len(script_abbrevs))
+  order = [i for i in positions if script_abbrevs[i] in in_extensions]
+  order += [i for i in positions if script_abbrevs[i] not in in_extensions]
+
+  script_names = [script_names[i] for i in order]
+  script_abbrevs = [script_abbrevs[i] for i in order]
+
+reorder_scripts()
+script_list_item_size = (script_names.index('Unknown') + 31) // 32
+
+
+# ---------------------------------------------------------------------------
+#                         DERIVED LISTS
+# ---------------------------------------------------------------------------
+
+# Create general character property names from the first letters of the
+# particular categories.
+
+gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
+general_category_names = list(gcn_set)
+general_category_names.sort()
+
+
+# ---------------------------------------------------------------------------
+#                           FUNCTIONS
+# ---------------------------------------------------------------------------
+
+import sys
+
+# Open an output file, using the command's argument or a default. Write common
+# preliminary header information.
+
+def open_output(default):
+  """Open the output file and write the standard header comment to it.
+
+  The file name is the command's single argument if one was given, otherwise
+  'default'. The script exits with status 1 if more than one argument is
+  present or if the file cannot be opened for writing. The header that is
+  written is a C comment containing the copyright and licence text, and it
+  names the running script (the last '/'-separated component of sys.argv[0])
+  as the thing to modify instead of the generated file.
+
+  Returns the open file object; the C comment has been closed, so the caller
+  can continue by writing code.
+  """
+  if len(sys.argv) > 2:
+    print('** Too many arguments: just give a file name')
+    sys.exit(1)
+  if len(sys.argv) == 2:
+    output_name = sys.argv[1]
+  else:
+    output_name = default
+  try:
+    file = open(output_name, "w")
+  except IOError:
+    print ("** Couldn't open %s" % output_name)
+    sys.exit(1)
+
+  # Reduce the command name to its final path component for use in the header.
+  script_name = sys.argv[0]
+  i = script_name.rfind('/')
+  if i >= 0:
+    script_name = script_name[i+1:]
+
+  # First part of the header: opens the C comment and gives the copyright.
+  file.write("""\
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
+
+This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
+""")
+
+  # Point the reader at the generating script rather than the generated file.
+  file.write("Instead, modify the maint/%s script and run it to generate\n"
+  "a new version of this code.\n\n" % script_name)
+
+  # Licence text; the final "*/" closes the C comment opened above.
+  file.write("""\
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+\n""")
+  return file
+
+# End of GenerateCommon.py
--- a/maint/GenerateTest26.py
+++ b/maint/GenerateTest26.py
@ -0,0 +1,188 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+#
+# This file auto-generates unicode property tests and their expected output.
+# It is recommended to re-run this generator after the unicode files are
+# updated. The names of the generated files are `testinput26` and `testoutput26`
+
+import re
+import sys
+
+from GenerateCommon import \
+  script_names, \
+  script_abbrevs
+
+def write_both(text):
+  # Send the same text to the test input file and to the expected output file.
+  for stream in (input_file, output_file):
+    stream.write(text)
+
+def to_string_char(ch_idx):
+  # Code points from 32 to 127 are returned as the character itself. All
+  # others are returned as a \x{...} hex escape, with a leading zero added
+  # for values below 16 so that there are at least two hex digits.
+  if 32 <= ch_idx < 128:
+    return chr(ch_idx)
+  template = "\\x{0%x}" if ch_idx < 16 else "\\x{%x}"
+  return template % ch_idx
+
+output_directory = ""
+
+if len(sys.argv) > 2:
+  print('** Too many arguments: just give a directory name')
+  sys.exit(1)
+if len(sys.argv) == 2:
+  output_directory = sys.argv[1]
+  if not output_directory.endswith("/"):
+    output_directory += "/"
+
+try:
+  input_file = open(output_directory + "testinput26", "w")
+  output_file = open(output_directory + "testoutput26", "w")
+except IOError:
+  print ("** Couldn't open output files")
+  sys.exit(1)
+
+write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
+
+# ---------------------------------------------------------------------------
+#                      UNICODE SCRIPT EXTENSION TESTS
+# ---------------------------------------------------------------------------
+
+write_both("# Unicode Script Extension tests.\n\n")
+
+def gen_script_tests():
+  """Write the script and script-extension tests to both test files.
+
+  Scripts.txt and ScriptExtensions.txt are read from Unicode.tables. For each
+  script a five-item record is built up in script_data:
+
+    [0] first code point of the first Scripts.txt range seen for the script
+    [1] last code point of the most recent Scripts.txt range seen for it
+    [2] lowest start of the ScriptExtensions.txt ranges that list the script
+    [3] highest end of the ScriptExtensions.txt ranges that list the script
+    [4] first code point found in one of those extension ranges whose script
+        in Scripts.txt is a different one (or None if there is none)
+
+  Items [2] to [4] stay None for scripts not listed in ScriptExtensions.txt.
+  The patterns and subject lines go to both files via write_both(); the
+  expected results (" 0: ..." or "No match") go to output_file only.
+  """
+  script_data = [None] * len(script_names)
+  # Script name of every code point according to Scripts.txt (None if absent).
+  char_data = [None] * 0x110000
+
+  # Groups: 1 = start of range, 2 = optional end of range, 3 = name column.
+  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
+  prev_name = ""
+  script_idx = -1
+
+  with open("Unicode.tables/Scripts.txt") as f:
+    for line in f:
+      match_obj = property_re.match(line)
+
+      if match_obj == None:
+        continue
+
+      # Look the index up only when the name changes from the previous line.
+      name = match_obj.group(3)
+      if name != prev_name:
+        script_idx = script_names.index(name)
+        prev_name = name
+
+      low = int(match_obj.group(1), 16)
+      high = low
+      char_data[low] = name
+
+      if match_obj.group(2) != None:
+        high = int(match_obj.group(2), 16)
+        for idx in range(low + 1, high + 1):
+           char_data[idx] = name
+
+      # The first range seen fixes item [0]; every range overwrites item [1].
+      if script_data[script_idx] == None:
+        script_data[script_idx] = [low, None, None, None, None]
+      script_data[script_idx][1] = high
+
+  # Maps each abbreviation already met in ScriptExtensions.txt to its index.
+  extended_script_indicies = {}
+
+  with open("Unicode.tables/ScriptExtensions.txt") as f:
+    for line in f:
+      match_obj = property_re.match(line)
+
+      if match_obj == None:
+        continue
+
+      low = int(match_obj.group(1), 16)
+      high = low
+      if match_obj.group(2) != None:
+        high = int(match_obj.group(2), 16)
+
+      # Here the name column is a space-separated list of abbreviations.
+      for abbrev in match_obj.group(3).split(" "):
+        if abbrev not in extended_script_indicies:
+          idx = script_abbrevs.index(abbrev)
+          extended_script_indicies[abbrev] = idx
+          rec = script_data[idx]
+          rec[2] = low
+          rec[3] = high
+        else:
+          # Seen before: widen the recorded extension bounds if necessary.
+          idx = extended_script_indicies[abbrev]
+          rec = script_data[idx]
+          if rec[2] > low:
+            rec[2] = low
+          if rec[3] < high:
+            rec[3] = high
+
+        # Until one is found, look in this range for a code point that
+        # Scripts.txt does not assign to this script. Note that idx is reused
+        # as the loop variable; it is set again before its next use above.
+        if rec[4] == None:
+          name = script_names[idx]
+          for idx in range(low, high + 1):
+            if char_data[idx] != name:
+              rec[4] = idx
+              break
+
+  # Alternates between the "scx" and "Script_Extensions" spellings, switching
+  # after each script that has extension data.
+  long_property_name = False
+
+  for idx, rec in enumerate(script_data):
+    script_name = script_names[idx]
+
+    if script_name == "Unknown":
+      continue
+
+    script_abbrev = script_abbrevs[idx]
+
+    # Items [0] and [1] must match the script, by full name and abbreviation.
+    write_both("# Base script check\n")
+    write_both("/^\\p{sc=%s}/utf\n" % script_name)
+    write_both("  %s\n" % to_string_char(rec[0]))
+    output_file.write(" 0: %s\n" % to_string_char(rec[0]))
+    write_both("\n")
+
+    write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
+    write_both("  %s\n" % to_string_char(rec[1]))
+    output_file.write(" 0: %s\n" % to_string_char(rec[1]))
+    write_both("\n")
+
+    # The remaining checks apply only to scripts with extension data.
+    if rec[2] != None:
+      property_name = "scx"
+      if long_property_name:
+        property_name = "Script_Extensions"
+
+      write_both("# Script extension check\n")
+      write_both("/^\\p{%s}/utf\n" % script_name)
+      write_both("  %s\n" % to_string_char(rec[2]))
+      output_file.write(" 0: %s\n" % to_string_char(rec[2]))
+      write_both("\n")
+
+      write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
+      write_both("  %s\n" % to_string_char(rec[3]))
+      output_file.write(" 0: %s\n" % to_string_char(rec[3]))
+      write_both("\n")
+
+      long_property_name = not long_property_name
+
+      # Item [4] is expected to match \p{name} but not \p{sc=name}.
+      if rec[4] != None:
+        write_both("# Script extension only character\n")
+        write_both("/^\\p{%s}/utf\n" % script_name)
+        write_both("  %s\n" % to_string_char(rec[4]))
+        output_file.write(" 0: %s\n" % to_string_char(rec[4]))
+        write_both("\n")
+
+        write_both("/^\\p{sc=%s}/utf\n" % script_name)
+        write_both("  %s\n" % to_string_char(rec[4]))
+        output_file.write("No match\n")
+        write_both("\n")
+      else:
+        print("External character has not found for %s" % script_name)
+
+    # The code point just after the larger of items [1] and [3] is used as a
+    # character that is expected not to match the script.
+    high = rec[1]
+    if rec[3] != None and rec[3] > rec[1]:
+      high = rec[3]
+    write_both("# Character not in script\n")
+    write_both("/^\\p{%s}/utf\n" % script_name)
+    write_both("  %s\n" % to_string_char(high + 1))
+    output_file.write("No match\n")
+    write_both("\n")
+
+
+gen_script_tests()
+
+write_both("# End of testinput26\n")
--- a/maint/GenerateUcd.py
+++ b/maint/GenerateUcd.py
@ -0,0 +1,923 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+#
+# This script generates the pcre2_ucd.c file from Unicode data files. This is
+# the compressed Unicode property data used by PCRE2. The script was created in
+# December 2021 as part of the Unicode data generation refactoring. It is
+# basically a re-working of the MultiStage2.py script that was submitted to the
+# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
+# Unicode property support. A number of extensions have since been added. The
+# main difference in the 2021 upgrade (apart from comments and layout) is that
+# the data tables (e.g. list of script names) are now listed in or generated by
+# a separate Python module that is shared with the other Generate scripts.
+#
+# This script must be run in the "maint" directory. It requires the following
+# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
+# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
+# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
+# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
+# emoji-data.txt. These must be in the Unicode.tables subdirectory.
+#
+# The emoji-data.txt file is found in the "emoji" subdirectory even though it
+# is technically part of a different (but coordinated) standard as shown
+# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
+# for example:
+#
+# http://unicode.org/Public/emoji/13.0/ReadMe.txt
+#
+# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
+# subdirectory of the Unicode database (UCD) on the Unicode web site;
+# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
+# are in the top-level UCD directory.
+#
+# -----------------------------------------------------------------------------
+# Minor modifications made to the original script:
+#  Added #! line at start
+#  Removed tabs
+#  Made it work with Python 2.4 by rewriting two statements that needed 2.5
+#  Consequent code tidy
+#  Adjusted data file names to take from the Unicode.tables directory
+#  Adjusted global table names by prefixing _pcre_.
+#  Commented out stuff relating to the casefolding table, which isn't used;
+#    removed completely in 2012.
+#  Corrected size calculation
+#  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
+#  Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
+#
+# Major modifications made to the original script:
+#  Added code to add a grapheme break property field to records.
+#
+#  Added code to search for sets of more than two characters that must match
+#  each other caselessly. A new table is output containing these sets, and
+#  offsets into the table are added to the main output records. This new
+#  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
+#  used.
+#
+#  Update for Python3:
+#    . Processed with 2to3, but that didn't fix everything
+#    . Changed string.strip to str.strip
+#    . Added encoding='utf-8' to the open() call
+#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
+#        required and the result of the division is a float
+#
+#  Added code to scan the emoji-data.txt file to find the Extended Pictographic
+#  property, which is used by PCRE2 as a grapheme breaking property. This was
+#  done when updating to Unicode 11.0.0 (July 2018).
+#
+#  Added code to add a Script Extensions field to records. This has increased
+#  their size from 8 to 12 bytes, only 10 of which are currently used.
+#
+#  Added code to add a bidi class field to records by scanning the
+#  DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
+#  bytes, so now 11 out of 12 are in use.
+#
+# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
+# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
+#     July-2012:     Updated list of scripts for Unicode 6.1.0
+# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
+#                      field in the record to hold the value. Luckily, the
+#                      structure had a hole in it, so the resulting table is
+#                      not much bigger than before.
+# 18-September-2012: Added code for multiple caseless sets. This uses the
+#                      final hole in the structure.
+# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
+# 13-May-2014:       Updated for PCRE2
+# 03-June-2014:      Updated for Python 3
+# 20-June-2014:      Updated for Unicode 7.0.0
+# 12-August-2014:    Updated to put Unicode version into the file
+# 19-June-2015:      Updated for Unicode 8.0.0
+# 02-July-2017:      Updated for Unicode 10.0.0
+# 03-July-2018:      Updated for Unicode 11.0.0
+# 07-July-2018:      Added code to scan emoji-data.txt for the Extended
+#                      Pictographic property.
+# 01-October-2018:   Added the 'Unknown' script name
+# 03-October-2018:   Added new field for Script Extensions
+# 27-July-2019:      Updated for Unicode 12.1.0
+# 10-March-2020:     Updated for Unicode 13.0.0
+# PCRE2-10.39:       Updated for Unicode 14.0.0
+# 05-December-2021:  Added code to scan DerivedBidiClass.txt for bidi class,
+#                      and also PropList.txt for the Bidi_Control property
+# 19-December-2021:  Reworked script extensions lists to be bit maps instead
+#                      of zero-terminated lists of script numbers.
+# ----------------------------------------------------------------------------
+#
+# Changes to the refactored script:
+#
+# 26-December-2021:  Refactoring completed
+# 10-January-2022:   Addition of general Boolean property support
+# 12-January-2022:   Merge scriptx and bidiclass fields
+# 14-January-2022:   Enlarge Boolean property offset to 12 bits
+#
+# ----------------------------------------------------------------------------
+#
+#
+# The main tables generated by this script are used by macros defined in
+# pcre2_internal.h. They look up Unicode character properties using short
+# sequences of code that contains no branches, which makes for greater speed.
+#
+# Conceptually, there is a table of records (of type ucd_record), one for each
+# Unicode character. Each record contains the script number, script extension
+# value, character type, grapheme break type, offset to caseless matching set,
+# offset to the character's other case, the bidi class, and offset to bitmap of
+# Boolean properties.
+#
+# A real table covering all Unicode characters would be far too big. It can be
+# efficiently compressed by observing that many characters have the same
+# record, and many blocks of characters (taking 128 characters in a block) have
+# the same set of records as other blocks. This leads to a 2-stage lookup
+# process.
+#
+# This script constructs seven tables. The ucd_caseless_sets table contains
+# lists of characters that all match each other caselessly. Each list is
+# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
+# any valid character. The first list is empty; this is used for characters
+# that are not part of any list.
+#
+# The ucd_digit_sets table contains the code points of the '9' characters in
+# each set of 10 decimal digits in Unicode. This is used to ensure that digits
+# in script runs all come from the same set. The first element in the vector
+# contains the number of subsequent elements, which are in ascending order.
+#
+# Scripts are partitioned into two groups. Scripts that appear in at least one
+# character's script extension list come first, followed by "Unknown" and then
+# all the rest. This sorting is done automatically in the GenerateCommon.py
+# script. A script's number is its index in the script_names list.
+#
+# The ucd_script_sets table contains bitmaps that represent lists of scripts
+# for Script Extensions properties. Each bitmap consists of a fixed number of
+# unsigned 32-bit numbers, enough to allocate a bit for every script that is
+# used in any character's extension list, that is, enough for every script
+# whose number is less than ucp_Unknown. A character's script extension value
+# in its ucd record is an offset into the ucd_script_sets vector. The first
+# bitmap has no bits set; characters that have no script extensions have zero
+# as their script extensions value so that they use this map.
+#
+# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
+# properties. Each bitmap consists of a fixed number of unsigned 32-bit
+# numbers, enough to allocate a bit for each supported Boolean property.
+#
+# The ucd_records table contains one instance of every unique character record
+# that is required. The ucd_stage1 table is indexed by a character's block
+# number, which is the character's code point divided by 128, since 128 is the
+# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
+# number.
+#
+# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
+# the offset of a character within its own block, and the result is the index
+# number of the required record in the ucd_records vector.
+#
+# The following examples are correct for the Unicode 14.0.0 database. Future
+# updates may change the actual lookup values.
+#
+# Example: lowercase "a" (U+0061) is in block 0
+#          lookup 0 in stage1 table yields 0
+#          lookup 97 (0x61) in the first table in stage2 yields 35
+#          record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
+#             0 = ucp_Latin   => Latin script
+#             5 = ucp_Ll      => Lower case letter
+#            12 = ucp_gbOther => Grapheme break property "Other"
+#             0               => Not part of a caseless set
+#           -32 (-0x20)       => Other case is U+0041
+#         18432 = 0x4800      => Combined Bidi class + script extension values
+#            44               => Offset to Boolean properties
+#
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#             9 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special script extension property
+#
+# Almost all lowercase latin characters resolve to the same record. One or two
+# are different because they are part of a multi-character caseless set (for
+# example, k, K and the Kelvin symbol are such a set).
+#
+# Example: hiragana letter A (U+3042) is in block 96 (0x60)
+#          lookup 96 in stage1 table yields 93
+#          lookup 66 (0x42) in table 93 in stage2 yields 819
+#          record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
+#            20 = ucp_Hiragana => Hiragana script
+#             7 = ucp_Lo       => Other letter
+#            12 = ucp_gbOther  => Grapheme break property "Other"
+#             0                => Not part of a caseless set
+#             0                => No other case
+#         18432 = 0x4800       => Combined Bidi class + script extension values
+#            82                => Offset to Boolean properties
+#
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#             9 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special script extension property
+#
+# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
+#          lookup 57 in stage1 table yields 55
+#          lookup 80 (0x50) in table 55 in stage2 yields 621
+#          record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
+#            84 = ucp_Inherited => Script inherited from predecessor
+#            12 = ucp_Mn        => Non-spacing mark
+#             3 = ucp_gbExtend  => Grapheme break property "Extend"
+#             0                 => Not part of a caseless set
+#             0                 => No other case
+#         26762 = 0x688A        => Combined Bidi class + script extension values
+#            96                 => Offset to Boolean properties
+#
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
+#           138                 => Script Extension list offset = 138
+#
+# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
+# 18, and 47 set. This means that this character is expected to be used with
+# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
+#
+#  Philip Hazel, last updated 14 January 2022.
+##############################################################################
+
+
+# Import standard modules
+
+import re
+import string
+import sys
+
+# Import common data lists and functions
+
+from GenerateCommon import \
+  bidi_classes, \
+  bool_properties, \
+  bool_propsfiles, \
+  bool_props_list_item_size, \
+  break_properties, \
+  category_names, \
+  general_category_names, \
+  script_abbrevs, \
+  script_list_item_size, \
+  script_names, \
+  open_output
+
+# Some general parameters
+
+MAX_UNICODE = 0x110000
+NOTACHAR = 0xffffffff
+
+
+# ---------------------------------------------------------------------------
+#                         DEFINE FUNCTIONS
+# ---------------------------------------------------------------------------
+
+
+# Build a line parser for Scripts.txt, GraphemeBreakProperty.txt,
+# DerivedBidiClass.txt or DerivedGeneralCategory.txt. The returned function
+# takes the split fields of one data line and yields the position of the
+# second field (the property value name) within the given list of names. A
+# name that is not in the list raises ValueError.
+
+def make_get_names(enum):
+  def get_name_index(chardata):
+    return enum.index(chardata[1])
+  return get_name_index
+
+
+# Parse a line of CaseFolding.txt. Only common ('C') and simple ('S') foldings
+# are used; for those the result is the signed distance from the character to
+# its folded form. Every other kind of line yields 0.
+
+def get_other_case(chardata):
+  if chardata[1] not in ('C', 'S'):
+    return 0
+  source = int(chardata[0], 16)
+  target = int(chardata[2], 16)
+  return target - source
+
+
+# Parse a line of ScriptExtensions.txt. The second field is a space-separated
+# list of script abbreviations. When it is the same as on the previous call,
+# the offset of the most recently added entry of script_lists is returned
+# again; otherwise the abbreviations are converted to script numbers, appended
+# to script_lists as a new entry, and the offset of that entry is returned.
+# Offsets are entry indices multiplied by script_list_item_size.
+
+def get_script_extension(chardata):
+  global last_script_extension
+
+  extension = chardata[1]
+  if extension == last_script_extension:
+    return (len(script_lists) - 1) * script_list_item_size
+
+  new_offset = len(script_lists) * script_list_item_size
+  last_script_extension = extension
+  script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in extension.split(' ')))
+  return new_offset
+
+
+# Read a whole table into memory, setting/checking the Unicode version.
+#
+#   file_name      path of the form <directory>/<base>.txt
+#   get_value      function called with the list of stripped, semicolon-
+#                    separated fields of each data line; it returns the value
+#                    to store for the code point(s) on that line
+#   default_value  initial value of every entry
+#
+# Returns a list of MAX_UNICODE values indexed by code point. The first line of
+# the file must be "# <base>-<x.y.z>.txt". The first version seen is saved in
+# the global unicode_version; a warning is written to stderr when a later file
+# carries a different version.
+
+def read_table(file_name, get_value, default_value):
+  global unicode_version
+
+  name_match = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
+  file_base = name_match.group(1)
+  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
+  table = [default_value] * MAX_UNICODE
+
+  # Using "with" ensures the file is closed even if a malformed line raises.
+  with open(file_name, 'r', encoding='utf-8') as file:
+    version = re.match(version_pat, file.readline()).group(1)
+    if unicode_version == "":
+      unicode_version = version
+    elif unicode_version != version:
+      print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)
+
+    for line in file:
+      line = re.sub(r'#.*', '', line)
+      chardata = list(map(str.strip, line.split(';')))
+      if len(chardata) <= 1:
+        continue
+      value = get_value(chardata)
+      m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
+      char = int(m.group(1), 16)
+      if m.group(3) is None:
+        last = char
+      else:
+        last = int(m.group(3), 16)
+      for i in range(char, last + 1):
+        # It is important not to overwrite a previously set value because in
+        # the CaseFolding file there are lines to be ignored (returning the
+        # default value of 0) which often come after a line which has already
+        # set data.
+        if table[i] == default_value:
+          table[i] = value
+  return table
+
+
+# Get the smallest possible C language type for the values in a table. The
+# result is a (type name, size in bytes) pair. Unsigned types are tried first,
+# smallest first, then the signed ones. OverflowError is raised when the
+# values do not fit into any of the 8, 16 or 32-bit types.
+
+def get_type_size(table):
+  candidates = (
+    ("uint8_t", 1, 0, 255),
+    ("uint16_t", 2, 0, 65535),
+    ("uint32_t", 4, 0, 4294967295),
+    ("signed char", 1, -128, 127),
+    ("int16_t", 2, -32768, 32767),
+    ("int32_t", 4, -2147483648, 2147483647))
+  lowest = min(table)
+  highest = max(table)
+  for c_type, c_size, minlimit, maxlimit in candidates:
+    if minlimit <= lowest and highest <= maxlimit:
+      return (c_type, c_size)
+  raise OverflowError("Too large to fit into C types")
+
+
+# Get the total size in bytes of a list of tables, each one stored using the
+# smallest C type that get_type_size() finds for it.
+
+def get_tables_size(*tables):
+  return sum(get_type_size(table)[1] * len(table) for table in tables)
+
+
+# Compress a table into the two stages. The table is cut into blocks of
+# block_size entries; each distinct block is stored once in stage 2, and
+# stage 1 holds, for every block of the input, the number of the stage 2 block
+# that has its contents. Returns (stage1, stage2).
+
+def compress_table(table, block_size):
+  blocks = {} # Dictionary for finding identical blocks
+  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
+  stage2 = [] # Stage 2 table contains the blocks with property values
+  table = tuple(table)
+  for i in range(0, len(table), block_size):
+    block = table[i:i+block_size]
+    start = blocks.get(block)
+    if start is None:
+      # Allocate a new block. Floor division keeps the block number an
+      # integer; "/" would give a float in Python 3.
+      start = len(stage2) // block_size
+      stage2 += block
+      blocks[block] = start
+    stage1.append(start)
+  return stage1, stage2
+
+
+# Output a table as a C array definition on the output file f. Without a
+# block size, the values are written 16 to a line, each line followed by a
+# comment giving a code point derived from the position in the table. With a
+# block size, each block is preceded by a comment giving its number and is
+# written at most 16 values to a line.
+
+def write_table(table, table_name, block_size = None):
+  c_type, elem_size = get_type_size(table)
+  ELEMS_PER_LINE = 16
+
+  heading = "const %s %s[] = { /* %d bytes" % (c_type, table_name, elem_size * len(table))
+  if block_size:
+    heading += ", block = %d" % block_size
+  f.write(heading + " */\n")
+  values = tuple(table)
+
+  if block_size is None:
+    row_fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
+    mult = MAX_UNICODE / len(values)
+    for start in range(0, len(values), ELEMS_PER_LINE):
+      row = values[start:start+ELEMS_PER_LINE]
+      f.write(row_fmt % (row + (int(start * mult),)))
+  else:
+    row_fmt = "%3d," * min(block_size, ELEMS_PER_LINE) + "\n"
+    if block_size > ELEMS_PER_LINE:
+      row_fmt = row_fmt * int(block_size / ELEMS_PER_LINE)
+    block_fmt = "\n/* block %d */\n" + row_fmt
+    for start in range(0, len(values), block_size):
+      f.write(block_fmt % ((start / block_size,) + values[start:start+block_size]))
+  f.write("};\n\n")
+
+
+# Extract the unique combinations of properties into records. The tables are
+# read in parallel, one tuple of values per position. Returns (index, records)
+# where records maps each distinct tuple to a number allocated in order of
+# first appearance, and index gives the record number for every position.
+
+def combine_tables(*tables):
+  records = {}
+  index = []
+  for combination in zip(*tables):
+    # setdefault() returns the existing number, or stores and returns the
+    # next free one when the combination has not been seen before.
+    index.append(records.setdefault(combination, len(records)))
+  return index, records
+
+
+# Create a record struct. Each field gets the smallest C type that holds all
+# the values found at that position in the records. Returns (size, text),
+# where size is the computed size of one record in bytes and text is a C
+# typedef for the record. The text ends with the closing of a C comment; the
+# caller is expected to have opened that comment.
+
+def get_record_size_struct(records):
+  size = 0
+  parts = ['typedef struct {\n']
+  for i in range(len(records[0])):
+    field_type, field_size = get_type_size([record[i] for record in records])
+    # add padding: round up to the next multiple of the field size, then
+    # add the field itself
+    size = ((size + field_size - 1) & -field_size) + field_size
+    parts.append('%s property_%d;\n' % (field_type, i))
+
+  # round up to the first item of the next structure in array
+  first_size = get_type_size([record[0] for record in records])[1]
+  size = (size + first_size - 1) & -first_size
+
+  parts.append('} ucd_record;\n*/\n')
+  return size, ''.join(parts)
+
+
+# Write records to the output file f as the initializer of the ucd_records
+# array. The records argument maps each record tuple to its number; records
+# are written in ascending order of number, one per line, each followed by a
+# comment giving its position in the output.
+
+def write_records(records, record_size):
+  total_bytes = len(records) * record_size
+  f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
+    '/* %d bytes, record size %d */\n' % (total_bytes, record_size))
+  ordered = sorted(records.items(), key = lambda item: item[1])
+  for position, (fields, _) in enumerate(ordered):
+    line_fmt = '  {' + '%6d, ' * len(fields) + '}, /* %3d */\n'
+    f.write(line_fmt % (fields + (position,)))
+  f.write('};\n\n')
+
+
+# Write a list of bit sets to the output file f, followed by the closing
+# brace of the C array. Each entry of the list is a collection of bit numbers;
+# it is written on one line as item_size 32-bit words in hexadecimal, with bit
+# number n held in word n // 32 at position n & 31.
+
+def write_bitsets(list, item_size):
+  for bit_numbers in list:
+    words = [0] * item_size
+    for bit in bit_numbers:
+      words[bit // 32] |= 1 << (bit & 31)
+    pieces = []
+    for position, word in enumerate(words):
+      separator = " " if position == 0 else ", "
+      pieces.append(separator + "0x%08xu" % word)
+    f.write("".join(pieces) + ",\n")
+  f.write("};\n\n")
+
+
+# ---------------------------------------------------------------------------
+# This bit of code must have been useful when the original script was being
+# developed. Retain it just in case it is ever needed again.
+
+# def test_record_size():
+#   tests = [ \
+#     ( [(3,), (6,), (6,), (1,)], 1 ), \
+#     ( [(300,), (600,), (600,), (100,)], 2 ), \
+#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
+#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
+#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
+#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
+#   ]
+#   for test in tests:
+#     size, struct = get_record_size_struct(test[0])
+#     assert(size == test[1])
+# test_record_size()
+# ---------------------------------------------------------------------------
+
+
+
+# ---------------------------------------------------------------------------
+#                       MAIN CODE FOR CREATING TABLES
+# ---------------------------------------------------------------------------
+
+# Set by read_table() from the first data file that is read; the version of
+# every later file is compared with it.
+unicode_version = ""
+
+# Some of the tables imported from GenerateCommon.py have alternate comment
+# strings for use by GenerateUcpHeader. The comments are not wanted here, so
+# remove them.
+
+bidi_classes = bidi_classes[::2]
+break_properties = break_properties[::2]
+category_names = category_names[::2]
+
+# Create the various tables from Unicode data files. Each call gives a list
+# indexed by code point; the last argument is the value used for code points
+# that the file does not set.
+
+script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
+category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
+break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
+other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
+bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
+
+# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
+# we need to find the Extended_Pictographic property for emoji characters. This
+# can be set as an additional grapheme break property, because the default for
+# all the emojis is "other". We scan the emoji-data.txt file and modify the
+# break-props table.
+
+# The two break property numbers are the same for every line, so look them up
+# once only.
+
+gb_other = break_properties.index('Other')
+gb_extended_pictographic = break_properties.index('Extended_Pictographic')
+
+# Using "with" ensures the file is closed even if a malformed line raises.
+
+with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
+  for line in file:
+    line = re.sub(r'#.*', '', line)
+    chardata = list(map(str.strip, line.split(';')))
+    if len(chardata) <= 1:
+      continue
+    if chardata[1] != "Extended_Pictographic":
+      continue
+    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
+    char = int(m.group(1), 16)
+    if m.group(3) is None:
+      last = char
+    else:
+      last = int(m.group(3), 16)
+    for i in range(char, last + 1):
+      # Warn if an existing break property is about to be overwritten. The
+      # values must be applied to the format string with "%"; passing them as
+      # extra arguments to print() would output the format unexpanded.
+      if break_props[i] != gb_other:
+        print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
+          (i, break_properties[break_props[i]]), file=sys.stderr)
+      break_props[i] = gb_extended_pictographic
+
+# Handle script extensions. The get_script_extension() function maintains a
+# list of unique bitmaps representing lists of scripts, returning the offset
+# in that list. Initialize the list with an empty set, which is used for
+# characters that have no script extensions.
+
+script_lists = [[]]
+last_script_extension = ""
+scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
+
+# Merge the bidi class into the same field: the bidi class is shifted up by
+# 11 bits and combined with the script extension offset below it.
+for idx in range(len(scriptx_bidi_class)):
+  scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
+bidi_class = None
+
+# Find the Boolean properties of each character. This next bit of magic creates
+# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
+# the *same* list, which is not what we want.
+
+bprops = [[] for _ in range(MAX_UNICODE)]
+
+# Collect the properties from the various files. Lines whose property name is
+# not in bool_properties are ignored. The files are read as UTF-8, like all
+# the other Unicode data files, so that the result does not depend on the
+# locale's default encoding.
+
+for filename in bool_propsfiles:
+  try:
+    file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
+  except IOError:
+    print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
+    sys.exit(1)
+
+  # Using "with" ensures the file is closed even if a malformed line raises.
+
+  with file:
+    for line in file:
+      line = re.sub(r'#.*', '', line)
+      data = list(map(str.strip, line.split(';')))
+      if len(data) <= 1:
+        continue
+
+      try:
+        ix = bool_properties.index(data[1])
+      except ValueError:
+        continue
+
+      m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
+      char = int(m.group(1), 16)
+      if m.group(3) is None:
+        last = char
+      else:
+        last = int(m.group(3), 16)
+
+      for i in range(char, last + 1):
+        bprops[i].append(ix)
+
+# The ASCII property isn't listed in any files, but it is easy enough to add
+# it manually.
+
+ix = bool_properties.index("ASCII")
+for i in range(128):
+  bprops[i].append(ix)
+
+# The Bidi_Mirrored property isn't listed in any property files. We have to
+# deduce it from the file that lists the mirrored characters: every character
+# that starts a data line in that file is given the property. The file is read
+# as UTF-8, like the other Unicode data files.
+
+ix = bool_properties.index("Bidi_Mirrored")
+
+try:
+  file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
+except IOError:
+  print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
+  sys.exit(1)
+
+# Using "with" ensures the file is closed even if a malformed line raises.
+
+with file:
+  for line in file:
+    line = re.sub(r'#.*', '', line)
+    data = list(map(str.strip, line.split(';')))
+    if len(data) <= 1:
+      continue
+    c = int(data[0], 16)
+    bprops[c].append(ix)
+
+# Scan each character's boolean property list and create a list of unique
+# lists, at the same time setting, in the bool_props vector, each character's
+# offset into that list. Two property lists count as the same when they hold
+# the same set of properties; the first such list encountered is the one that
+# is kept. The first entry is the empty list.
+
+bool_props = [0] * MAX_UNICODE
+bool_props_lists = [[]]
+
+# This dictionary maps a set of properties to its index in bool_props_lists,
+# so that each character needs a single lookup instead of a comparison with
+# every list found so far.
+
+bool_props_index = {frozenset(): 0}
+
+for c in range(MAX_UNICODE):
+  key = frozenset(bprops[c])
+  i = bool_props_index.get(key)
+  if i is None:
+    i = len(bool_props_lists)
+    bool_props_lists.append(bprops[c])
+    bool_props_index[key] = i
+
+  bool_props[c] = i * bool_props_list_item_size
+
+# This block of code was added by PH in September 2012. It scans the other_case
+# table to find sets of more than two characters that must all match each other
+# caselessly. Later in this script a table of these sets is written out.
+# However, we have to do this work here in order to compute the offsets in the
+# table that are inserted into the main table.
+
+# CaseFolding.txt gives each pair in one direction only, and the common
+# reading logic stores just that one value. Go through the table and, for
+# every character that has an offset to another case, store the opposite
+# offset on the target character unless the target already has one.
+
+for c in range(MAX_UNICODE):
+  delta = other_case[c]
+  if delta == 0:
+    continue
+  if other_case[c + delta] == 0:
+    other_case[c + delta] = -delta
+
+# Now scan again and create equivalence sets.
+
+caseless_sets = []
+
+for c in range(MAX_UNICODE):
+  o = c + other_case[c]
+
+  # Trigger when this character's other case does not point back here. We
+  # now have three characters that are case-equivalent.
+
+  if other_case[o] != -other_case[c]:
+    t = o + other_case[o]
+
+    # Scan the existing sets to see if any of the three characters are already
+    # part of a set. If so, add to that set each of the three that it does not
+    # yet contain. Membership is tested separately for each character, so
+    # that finding one of them does not stop the others from being added.
+
+    appended = 0
+    for s in caseless_sets:
+      if c in s or o in s or t in s:
+        for y in [c, o, t]:
+          if y not in s:
+            s.append(y)
+        appended = 1
+
+    # If we have not added to an existing set, create a new one.
+
+    if not appended:
+      caseless_sets.append([c, o, t])
+
+# End of loop looking for caseless sets.
+
+# Now scan the sets and set appropriate offsets for the characters. Offset 0
+# is left for characters that are not in any set; each set takes one position
+# per member plus one more.
+
+caseless_offsets = [0] * MAX_UNICODE
+
+offset = 1
+for s in caseless_sets:
+  for x in s:
+    caseless_offsets[x] = offset
+  offset += len(s) + 1
+
+# End of block of code for creating offsets for caseless matching sets.
+
+
+# Combine all the tables
+
+table, records = combine_tables(script, category, break_props,
+  caseless_offsets, other_case, scriptx_bidi_class, bool_props)
+
+# Find the record size and create a string definition of the structure for
+# outputting as a comment.
+
+record_size, record_struct = get_record_size_struct(list(records.keys()))
+
+# Find the optimum block size for the two-stage table. Each of the block sizes
+# 32, 64, 128, 256 and 512 is tried, and the total size of the records plus
+# the two stage tables is computed for it.
+
+min_size = sys.maxsize
+for block_size in [2 ** i for i in range(5,10)]:
+  size = len(records) * record_size
+  stage1, stage2 = compress_table(table, block_size)
+  size += get_tables_size(stage1, stage2)
+  #print "/* block size %5d  => %5d bytes */" % (block_size, size)
+  # The comparison is strict, so when two block sizes give the same total the
+  # smaller block size is kept.
+  if size < min_size:
+    min_size = size
+    min_stage1, min_stage2 = stage1, stage2
+    min_block_size = block_size
+
+
+# ---------------------------------------------------------------------------
+#                   MAIN CODE FOR WRITING THE OUTPUT FILE
+# ---------------------------------------------------------------------------
+
+# Open the output file (no return on failure). This call also writes standard
+# header boilerplate.
+
+f = open_output("pcre2_ucd.c")
+
+# Output this file's heading text
+
+f.write("""\
+/* This file contains tables of Unicode properties that are extracted from
+Unicode data files. See the comments at the start of maint/GenerateUcd.py for
+details.
+
+As well as being part of the PCRE2 library, this file is #included by the
+pcre2test program, which redefines the PRIV macro to change table names from
+_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
+just one of these tables is actually needed. When compiling the library, some
+headers are needed. */
+
+#ifndef PCRE2_PCRE2TEST
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "pcre2_internal.h"
+#endif /* PCRE2_PCRE2TEST */
+
+/* The tables herein are needed only when UCP support is built, and in PCRE2
+that happens automatically with UTF support. This module should not be
+referenced otherwise, so it should not matter whether it is compiled or not.
+However a comment was received about space saving - maybe the guy linked all
+the modules rather than using a library - so we include a condition to cut out
+the tables when not needed. But don't leave a totally empty module because some
+compilers barf at that. Instead, just supply some small dummy tables. */
+
+#ifndef SUPPORT_UNICODE
+const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
+const uint16_t PRIV(ucd_stage1)[] = {0};
+const uint16_t PRIV(ucd_stage2)[] = {0};
+const uint32_t PRIV(ucd_caseless_sets)[] = {0};
+#else
+\n""")
+
+# --- Output some variable heading stuff ---
+
+f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
+f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
+
+f.write("""\
+/* When recompiling tables with a new Unicode version, please check the types
+in this structure definition with those in pcre2_internal.h (the actual field
+names will be different).
+\n""")
+
+f.write(record_struct)
+
+f.write("""
+/* If the 32-bit library is run in non-32-bit mode, character values greater
+than 0x10ffff may be encountered. For these we set up a special record. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 32
+const ucd_record PRIV(dummy_ucd_record)[] = {{
+  ucp_Unknown,    /* script */
+  ucp_Cn,         /* type unassigned */
+  ucp_gbOther,    /* grapheme break property */
+  0,              /* case set */
+  0,              /* other case */
+  0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
+  0,              /* bool properties offset */
+  }};
+#endif
+\n""")
+
+# --- Output the table of caseless character sets ---
+
+f.write("""\
+/* This table contains lists of characters that are caseless sets of
+more than one character. Each list is terminated by NOTACHAR. */
+
+const uint32_t PRIV(ucd_caseless_sets)[] = {
+  NOTACHAR,
+""")
+
+# Each set goes on a line of its own, with its members in ascending order and
+# NOTACHAR after the last one.
+
+for members in caseless_sets:
+  row = ''.join('  0x%04x,' % code for code in sorted(members))
+  f.write(row + '  NOTACHAR,\n')
+f.write('};\n\n')
+
+# --- Other tables are not needed by pcre2test ---
+
+f.write("""\
+/* When #included in pcre2test, we don't need the table of digit sets, nor the
+the large main UCD tables. */
+
+#ifndef PCRE2_PCRE2TEST
+\n""")
+
+# --- Read Scripts.txt again for the sets of 10 digits. ---
+
+# Only lines that give a range of code points and have "Nd" in their comment
+# are used. The code point of the last digit of each group of ten in the range
+# is recorded. A range whose length is not a multiple of ten is reported on
+# stderr with print(); the write() method of a file has no "file" argument.
+
+digitsets = []
+
+with open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') as file:
+  for line in file:
+    m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
+    if m is None:
+      continue
+    first = int(m.group(1),16)
+    last  = int(m.group(2),16)
+    if ((last - first + 1) % 10) != 0:
+      print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
+        file=sys.stderr)
+    while first < last:
+      digitsets.append(first + 9)
+      first += 10
+digitsets.sort()
+
+f.write("""\
+/* This table lists the code points for the '9' characters in each set of
+decimal digits. It is used to ensure that all the digits in a script run come
+from the same set. */
+
+const uint32_t PRIV(ucd_digit_sets)[] = {
+""")
+
+# The count comes first, then the values, eight to a line.
+
+f.write("  %d,  /* Number of subsequent values */" % len(digitsets))
+for position, nine in enumerate(digitsets):
+  if position % 8 == 0:
+    f.write("\n ")
+  f.write(" 0x%05x," % nine)
+f.write("\n};\n\n")
+
+f.write("""\
+/* This vector is a list of script bitsets for the Script Extension property.
+The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
+ucd_script_sets_item_size. */
+
+const uint32_t PRIV(ucd_script_sets)[] = {
+""")
+write_bitsets(script_lists, script_list_item_size)
+
+f.write("""\
+/* This vector is a list of bitsets for Boolean properties. The number of
+32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
+pcre2_ucp.h. */
+
+const uint32_t PRIV(ucd_boolprop_sets)[] = {
+""")
+write_bitsets(bool_props_lists, bool_props_list_item_size)
+
+
+# Output the main UCD tables.
+
+f.write("""\
+/* These are the main two-stage UCD tables. The fields in each record are:
+script (8 bits), character type (8 bits), grapheme break property (8 bits),
+offset to multichar other cases or zero (8 bits), offset to other case or zero
+(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
+into a 16-bit field, and offset in binary properties table (16 bits). */
+\n""")
+
+write_records(records, record_size)
+write_table(min_stage1, 'PRIV(ucd_stage1)')
+write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
+
+f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
+f.write("""\
+#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
+#endif
+#endif  /* SUPPORT_UNICODE */
+
+#endif  /* PCRE2_PCRE2TEST */
+
+/* End of pcre2_ucd.c */
+""")
+
+# Close the output file. The parentheses are needed: "f.close" on its own
+# only refers to the method and does not call it.
+
+f.close()
+
+# End
--- a/maint/GenerateUcpHeader.py
+++ b/maint/GenerateUcpHeader.py
@ -0,0 +1,98 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This script generates the pcre2_ucp.h file from Unicode data files. This
+# header uses enumerations to give names to Unicode property types and script
+# names.
+
+# This script was created in December 2021 as part of the Unicode data
+# generation refactoring.
+
+
+# Import common data lists and functions
+
+from GenerateCommon import \
+  bidi_classes, \
+  bool_properties, \
+  bool_props_list_item_size, \
+  break_properties, \
+  category_names, \
+  general_category_names, \
+  script_list_item_size, \
+  script_names, \
+  open_output
+
+# Open the output file (no return on failure). This call also writes standard
+# header boilerplate.
+
+f = open_output("pcre2_ucp.h")
+
+# Output this file's heading text
+
+f.write("""\
+#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
+#define PCRE2_UCP_H_IDEMPOTENT_GUARD
+
+/* This file contains definitions of the Unicode property values that are
+returned by the UCD access macros and used throughout PCRE2.
+
+IMPORTANT: The specific values of the first two enums (general and particular
+character categories) are assumed by the table called catposstab in the file
+pcre2_auto_possess.c. They are unlikely to change, but should be checked after
+an update. */
+\n""")
+
+f.write("/* These are the general character categories. */\n\nenum {\n")
+for i in general_category_names:
+  f.write("  ucp_%s,\n" % i)
+f.write("};\n\n")
+
+f.write("/* These are the particular character categories. */\n\nenum {\n")
+for i in range(0, len(category_names), 2):
+  f.write("  ucp_%s,    /* %s */\n" % (category_names[i], category_names[i+1]))
+f.write("};\n\n")
+
+f.write("/* These are Boolean properties. */\n\nenum {\n")
+for i in bool_properties:
+  f.write("  ucp_%s,\n" % i)
+
+f.write("  /* This must be last */\n")
+f.write("  ucp_Bprop_Count\n};\n\n")
+
+f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
+f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
+
+f.write("/* These are the bidi class values. */\n\nenum {\n")
+for i in range(0, len(bidi_classes), 2):
+  sp = ' ' * (4 - len(bidi_classes[i]))
+  f.write("  ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
+f.write("};\n\n")
+
+f.write("/* These are grapheme break properties. The Extended Pictographic "
+  "property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
+for i in range(0, len(break_properties), 2):
+  sp = ' ' * (21 - len(break_properties[i]))
+  f.write("  ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
+f.write("};\n\n")
+
+f.write("/* These are the script identifications. */\n\nenum {\n  /* Scripts which has characters in other scripts. */\n")
+for i in script_names:
+  if i == "Unknown":
+    f.write("\n  /* Scripts which has no characters in other scripts. */\n")
+  f.write("  ucp_%s,\n" % i)
+f.write("\n")
+
+f.write("  /* This must be last */\n")
+f.write("  ucp_Script_Count\n};\n\n")
+
+f.write("/* Size of entries in ucd_script_sets[] */\n\n")
+f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
+
+f.write("#endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
+f.write("/* End of pcre2_ucp.h */\n")
+
+f.close()
+
+# End
--- a/maint/GenerateUcpTables.py
+++ b/maint/GenerateUcpTables.py
@ -0,0 +1,203 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+
+# This script generates the pcre2_ucptables.c file, which contains tables for
+# recognizing Unicode property names. It is #included by pcre2_tables.c. In
+# order to reduce the number of relocations when loading the PCRE2 library, the
+# names are held as a single large string, with offsets in the table. This is
+# tedious to maintain by hand. Therefore, a script is used to generate the
+# table.
+
+# This script was created in December 2021 based on the previous GenerateUtt
+# script, whose output had to be manually edited into pcre2_tables.c. Here is
+# the history of the original script:
+
+# -----------------------------------------------------------------------------
+# Modified by PH 17-March-2009 to generate the more verbose form that works
+# for UTF-support in EBCDIC as well as ASCII environments.
+# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
+# Modified by PH 04-May-2010 to add new "X.." special categories.
+# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
+# Modified by ChPe 30-September-2012 to add this note; no other changes were
+# necessary for Unicode 6.2.0 support.
+# Modified by PH 26-February-2013 to add the Xuc special category.
+# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
+# Script updated to Python 3 by running it through the 2to3 converter.
+# Added script names for Unicode 7.0.0, 20-June-2014.
+# Added script names for Unicode 8.0.0, 19-June-2015.
+# Added script names for Unicode 10.0.0, 02-July-2017.
+# Added script names for Unicode 11.0.0, 03-July-2018.
+# Added 'Unknown' script, 01-October-2018.
+# Added script names for Unicode 12.1.0, 27-July-2019.
+# Added script names for Unicode 13.0.0, 10-March-2020.
+# Added Script names for Unicode 14.0.0, PCRE2-10.39
+# Added support for bidi class and bidi control, 06-December-2021
+#   This also involved lower casing strings and removing underscores, in
+#   accordance with Unicode's "loose matching" rules, which Perl observes.
+# Changed default script type from PT_SC to PT_SCX, 18-December-2021
+# -----------------------------------------------------------------------------
+#
+# Note subsequent changes here:
+#
+# 27-December-2021: Added support for 4-letter script abbreviations.
+# 10-January-2022:  Further updates for Boolean property support
+# -----------------------------------------------------------------------------
+
+
+# Import common data lists and functions
+
+from GenerateCommon import \
+  abbreviations, \
+  bool_properties, \
+  bidi_classes, \
+  category_names, \
+  general_category_names, \
+  script_names, \
+  open_output
+
+# Open the output file (no return on failure). This call also writes standard
+# header boilerplate.
+
+f = open_output("pcre2_ucptables.c")
+
+# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
+# etc., along with comments. We need to add "bidi" in front of each value, in
+# order to create names that don't clash with other types of property.
+
+bidi_class_names = []
+for i in range(0, len(bidi_classes), 2):
+  bidi_class_names.append("bidi" + bidi_classes[i])
+
+# Remove the comments from other lists that contain them.
+
+category_names = category_names[::2]
+
+# Create standardized versions of the names by lowercasing and removing
+# underscores.
+
+def stdname(x):
+  return x.lower().replace('_', '')
+
+def stdnames(x):
+  y = [''] * len(x)
+  for i in range(len(x)):
+    y[i] = stdname(x[i])
+  return y
+
+std_category_names = stdnames(category_names)
+std_general_category_names = stdnames(general_category_names)
+std_bidi_class_names = stdnames(bidi_class_names)
+std_bool_properties = stdnames(bool_properties)
+
+# Create the table, starting with the Unicode script, category and bidi class
+# names. We keep both the standardized name and the original, because the
+# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
+# still use the full original names.
+
+utt_table = []
+
+scx_end = script_names.index('Unknown')
+
+for idx, name in enumerate(script_names):
+  pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
+  utt_table.append((stdname(name), name, pt_type))
+  for abbrev in abbreviations[name]:
+    utt_table.append((stdname(abbrev), name, pt_type))
+
+# Add the remaining property lists
+
+utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
+utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
+utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
+
+for name in bool_properties:
+  utt_table.append((stdname(name), name, 'PT_BOOL'))
+  if name in abbreviations: 
+    for abbrev in abbreviations[name]:
+      utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
+
+# Now add specials and synonyms. Note both the standardized and capitalized
+# forms are needed.
+
+utt_table.append(('any', 'Any', 'PT_ANY'))
+utt_table.append(('l&',  'L&',  'PT_LAMP'))
+utt_table.append(('lc',  'LC',  'PT_LAMP'))
+utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
+utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
+utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
+utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
+utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
+
+# Remove duplicates from the table and then sort it.
+
+utt_table = list(set(utt_table)) 
+utt_table.sort()
+
+# Output file-specific heading
+
+f.write("""\
+#ifdef SUPPORT_UNICODE
+
+/* The PRIV(utt)[] table below translates Unicode property names into type and
+code values. It is searched by binary chop, so must be in collating sequence of
+name. Originally, the table contained pointers to the name strings in the first
+field of each entry. However, that leads to a large number of relocations when
+a shared library is dynamically loaded. A significant reduction is made by
+putting all the names into a single, large string and using offsets instead.
+All letters are lower cased, and underscores are removed, in accordance with
+the "loose matching" rules that Unicode advises and Perl uses. */
+\n""")
+
+# We have to use STR_ macros to define the strings so that it all works in
+# UTF-8 mode on EBCDIC platforms.
+
+for utt in utt_table:
+  f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))
+  for c in utt[0]:
+    if c == '&':
+      f.write(' STR_AMPERSAND')
+    else:
+      f.write(' STR_%s' % c);
+  f.write(' "\\0"\n')
+
+# Output the long string of concatenated names
+
+f.write('\nconst char PRIV(utt_names)[] =\n');
+last = ''
+for utt in utt_table:
+  if utt == utt_table[-1]:
+    last = ';'
+  f.write('  STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
+
+# Output the property type table
+
+f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
+offset = 0
+last = ','
+for utt in utt_table:
+  if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
+      'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
+    value = '0'
+  else:
+    value = 'ucp_' + utt[1]
+  if utt == utt_table[-1]:
+    last = ''
+  f.write('  { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
+  offset += len(utt[0]) + 1
+f.write('};\n\n')
+
+# Ending text
+
+f.write("""\
+const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
+
+#endif /* SUPPORT_UNICODE */
+
+/* End of pcre2_ucptables.c */
+""")
+
+f.close()  # call the method: a bare "f.close" is a no-op expression
+
+# End
--- a/maint/GenerateUtt.py
+++ b/maint/GenerateUtt.py
@ -1,137 +0,0 @@
-#! /usr/bin/python
-
-# Generate utt tables. Note: this script has now been converted to Python 3.
-
-# The source file pcre2_tables.c contains (amongst other things), a table that
-# is indexed by script name. In order to reduce the number of relocations when
-# loading the library, the names are held as a single large string, with
-# offsets in the table. This is tedious to maintain by hand. Therefore, this
-# script is used to generate the table. The output is sent to stdout; usually
-# that should be directed to a temporary file. Then pcre2_tables.c can be
-# edited by replacing the relevant definitions and table therein with the
-# temporary file.
-
-# Modified by PH 17-March-2009 to generate the more verbose form that works
-# for UTF-support in EBCDIC as well as ASCII environments.
-# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
-# Modified by PH 04-May-2010 to add new "X.." special categories.
-# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
-# Modified by ChPe 30-September-2012 to add this note; no other changes were
-# necessary for Unicode 6.2.0 support.
-# Modfied by PH 26-February-2013 to add the Xuc special category.
-# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
-# Script updated to Python 3 by running it through the 2to3 converter.
-# Added script names for Unicode 7.0.0, 20-June-2014.
-# Added script names for Unicode 8.0.0, 19-June-2015.
-# Added script names for Unicode 10.0.0, 02-July-2017.
-# Added script names for Unicode 11.0.0, 03-July-2018.
-# Added 'Unknown' script, 01-October-2018.
-# Added script names for Unicode 12.1.0, 27-July-2019.
-# Added script names for Unicode 13.0.0, 10-March-2020.
-
-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
- # New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
- # New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
- # New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
- # New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic', \
-# New for Unicode 6.1.0
- 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
-# New for Unicode 7.0.0
- 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
- 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
- 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
- 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
-# New for Unicode 8.0.0
- 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
- 'SignWriting',
-# New for Unicode 10.0.0
- 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
- 'Nushu', 'Soyombo', 'Zanabazar_Square',
-# New for Unicode 11.0.0
-  'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
-  'Old_Sogdian', 'Sogdian',
-# New for Unicode 12.0.0
-  'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
-# New for Unicode 13.0.0
-  'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi'
- ]
-
-category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
-  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
-  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
-
-general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
-
-# First add the Unicode script and category names.
-
-utt_table  = list(zip(script_names, ['PT_SC'] * len(script_names)))
-utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
-utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
-
-# Now add our own specials.
-
-utt_table.append(('Any', 'PT_ANY'))
-utt_table.append(('L&',  'PT_LAMP'))
-utt_table.append(('Xan', 'PT_ALNUM'))
-utt_table.append(('Xps', 'PT_PXSPACE'))
-utt_table.append(('Xsp', 'PT_SPACE'))
-utt_table.append(('Xuc', 'PT_UCNC'))
-utt_table.append(('Xwd', 'PT_WORD'))
-
-# Sort the table.
-
-utt_table.sort()
-
-# We have to use STR_ macros to define the strings so that it all works in
-# UTF-8 mode on EBCDIC platforms.
-
-for utt in utt_table:
-        print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
-        for c in utt[0]:
-                if c == '_':
-                        print('STR_UNDERSCORE', end=' ')
-                elif c == '&':
-                        print('STR_AMPERSAND', end=' ')
-                else:
-                        print('STR_%s' % c, end=' ');
-        print('"\\0"')
-
-# Print the actual table, using the string names
-
-print('')
-print('const char PRIV(utt_names)[] =');
-last = ''
-for utt in utt_table:
-        if utt == utt_table[-1]:
-                last = ';'
-        print('  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
-# This was how it was done before the EBCDIC-compatible modification.
-#        print '  "%s\\0"%s' % (utt[0], last)
-
-print('\nconst ucp_type_table PRIV(utt)[] = {')
-offset = 0
-last = ','
-for utt in utt_table:
-        if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', 
-          'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
-                value = '0'
-        else:
-                value = 'ucp_' + utt[0]
-        if utt == utt_table[-1]:
-                last = ''
-        print('  { %3d, %s, %s }%s' % (offset, utt[1], value, last))
-        offset += len(utt[0]) + 1
-print('};')
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -1,814 +0,0 @@
-#! /usr/bin/python
-
-# Multistage table builder
-# (c) Peter Kankowski, 2008
-
-##############################################################################
-# This script was submitted to the PCRE project by Peter Kankowski as part of
-# the upgrading of Unicode property support. The new code speeds up property
-# matching many times. The script is for the use of PCRE maintainers, to
-# generate the pcre2_ucd.c file that contains a digested form of the Unicode
-# data tables. A number of extensions have been added to the original script.
-#
-# The script has now been upgraded to Python 3 for PCRE2, and should be run in
-# the maint subdirectory, using the command
-#
-# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
-#
-# It requires six Unicode data tables: DerivedGeneralCategory.txt,
-# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt,
-# CaseFolding.txt, and emoji-data.txt. These must be in the
-# maint/Unicode.tables subdirectory.
-#
-# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the
-# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is
-# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and
-# CaseFolding.txt are directly in the UCD directory.
-#
-# The emoji-data.txt file is found in the "emoji" subdirectory even though it
-# is technically part of a different (but coordinated) standard as shown
-# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
-# for example:
-#
-# http://unicode.org/Public/emoji/13.0/ReadMe.txt
-#
-# -----------------------------------------------------------------------------
-# Minor modifications made to this script:
-#  Added #! line at start
-#  Removed tabs
-#  Made it work with Python 2.4 by rewriting two statements that needed 2.5
-#  Consequent code tidy
-#  Adjusted data file names to take from the Unicode.tables directory
-#  Adjusted global table names by prefixing _pcre_.
-#  Commented out stuff relating to the casefolding table, which isn't used;
-#    removed completely in 2012.
-#  Corrected size calculation
-#  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
-#  Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
-#
-# Major modifications made to this script:
-#  Added code to add a grapheme break property field to records.
-#
-#  Added code to search for sets of more than two characters that must match
-#  each other caselessly. A new table is output containing these sets, and
-#  offsets into the table are added to the main output records. This new
-#  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
-#  used.
-#
-#  Update for Python3:
-#    . Processed with 2to3, but that didn't fix everything
-#    . Changed string.strip to str.strip
-#    . Added encoding='utf-8' to the open() call
-#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
-#        required and the result of the division is a float
-#
-#  Added code to scan the emoji-data.txt file to find the Extended Pictographic
-#  property, which is used by PCRE2 as a grapheme breaking property. This was
-#  done when updating to Unicode 11.0.0 (July 2018).
-#
-#  Added code to add a Script Extensions field to records. This has increased
-#  their size from 8 to 12 bytes, only 10 of which are currently used.
-#
-# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
-# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
-#     July-2012:     Updated list of scripts for Unicode 6.1.0
-# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
-#                      field in the record to hold the value. Luckily, the
-#                      structure had a hole in it, so the resulting table is
-#                      not much bigger than before.
-# 18-September-2012: Added code for multiple caseless sets. This uses the
-#                      final hole in the structure.
-# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
-# 13-May-2014:       Updated for PCRE2
-# 03-June-2014:      Updated for Python 3
-# 20-June-2014:      Updated for Unicode 7.0.0
-# 12-August-2014:    Updated to put Unicode version into the file
-# 19-June-2015:      Updated for Unicode 8.0.0
-# 02-July-2017:      Updated for Unicode 10.0.0
-# 03-July-2018:      Updated for Unicode 11.0.0
-# 07-July-2018:      Added code to scan emoji-data.txt for the Extended
-#                      Pictographic property.
-# 01-October-2018:   Added the 'Unknown' script name
-# 03-October-2018:   Added new field for Script Extensions
-# 27-July-2019:      Updated for Unicode 12.1.0
-# 10-March-2020:     Updated for Unicode 13.0.0
-# ----------------------------------------------------------------------------
-#
-#
-# The main tables generated by this script are used by macros defined in
-# pcre2_internal.h. They look up Unicode character properties using short
-# sequences of code that contains no branches, which makes for greater speed.
-#
-# Conceptually, there is a table of records (of type ucd_record), containing a
-# script number, script extension value, character type, grapheme break type,
-# offset to caseless matching set, offset to the character's other case, for
-# every Unicode character. However, a real table covering all Unicode
-# characters would be far too big. It can be efficiently compressed by
-# observing that many characters have the same record, and many blocks of
-# characters (taking 128 characters in a block) have the same set of records as
-# other blocks. This leads to a 2-stage lookup process.
-#
-# This script constructs six tables. The ucd_caseless_sets table contains
-# lists of characters that all match each other caselessly. Each list is
-# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
-# any valid character. The first list is empty; this is used for characters
-# that are not part of any list.
-#
-# The ucd_digit_sets table contains the code points of the '9' characters in
-# each set of 10 decimal digits in Unicode. This is used to ensure that digits
-# in script runs all come from the same set. The first element in the vector
-# contains the number of subsequent elements, which are in ascending order.
-#
-# The ucd_script_sets vector contains lists of script numbers that are the
-# Script Extensions properties of certain characters. Each list is terminated
-# by zero (ucp_Unknown). A character with more than one script listed for its
-# Script Extension property has a negative value in its record. This is the
-# negated offset to the start of the relevant list in the ucd_script_sets
-# vector.
-#
-# The ucd_records table contains one instance of every unique record that is
-# required. The ucd_stage1 table is indexed by a character's block number,
-# which is the character's code point divided by 128, since 128 is the size
-# of each block. The result of a lookup in ucd_stage1 a "virtual" block number.
-#
-# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
-# the offset of a character within its own block, and the result is the index
-# number of the required record in the ucd_records vector.
-#
-# The following examples are correct for the Unicode 11.0.0 database. Future
-# updates may make change the actual lookup values.
-#
-# Example: lowercase "a" (U+0061) is in block 0
-#          lookup 0 in stage1 table yields 0
-#          lookup 97 (0x61) in the first table in stage2 yields 17
-#          record 17 is { 34, 5, 12, 0, -32, 34, 0 }
-#            34 = ucp_Latin   => Latin script
-#             5 = ucp_Ll      => Lower case letter
-#            12 = ucp_gbOther => Grapheme break property "Other"
-#             0               => Not part of a caseless set
-#           -32 (-0x20)       => Other case is U+0041
-#            34 = ucp_Latin   => No special Script Extension property
-#             0               => Dummy value, unused at present
-#
-# Almost all lowercase latin characters resolve to the same record. One or two
-# are different because they are part of a multi-character caseless set (for
-# example, k, K and the Kelvin symbol are such a set).
-#
-# Example: hiragana letter A (U+3042) is in block 96 (0x60)
-#          lookup 96 in stage1 table yields 90
-#          lookup 66 (0x42) in table 90 in stage2 yields 564
-#          record 564 is { 27, 7, 12, 0, 0, 27, 0 }
-#            27 = ucp_Hiragana => Hiragana script
-#             7 = ucp_Lo       => Other letter
-#            12 = ucp_gbOther  => Grapheme break property "Other"
-#             0                => Not part of a caseless set
-#             0                => No other case
-#            27 = ucp_Hiragana => No special Script Extension property
-#             0                => Dummy value, unused at present
-#
-# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
-#          lookup 57 in stage1 table yields 55
-#          lookup 80 (0x50) in table 55 in stage2 yields 458
-#          record 458 is { 28, 12, 3, 0, 0, -101, 0 }
-#            28 = ucp_Inherited => Script inherited from predecessor
-#            12 = ucp_Mn        => Non-spacing mark
-#             3 = ucp_gbExtend  => Grapheme break property "Extend"
-#             0                 => Not part of a caseless set
-#             0                 => No other case
-#          -101                 => Script Extension list offset = 101
-#             0                 => Dummy value, unused at present
-#
-# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29,
-# and terminator 0. This means that this character is expected to be used with
-# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
-#
-#  Philip Hazel, 03 July 2008
-##############################################################################
-
-
-import re
-import string
-import sys
-
-MAX_UNICODE = 0x110000
-NOTACHAR = 0xffffffff
-
-
-# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
-def make_get_names(enum):
-        return lambda chardata: enum.index(chardata[1])
-
-# Parse a line of CaseFolding.txt
-def get_other_case(chardata):
-        if chardata[1] == 'C' or chardata[1] == 'S':
-          return int(chardata[2], 16) - int(chardata[0], 16)
-        return 0
-
-# Parse a line of ScriptExtensions.txt
-def get_script_extension(chardata):
-        this_script_list = list(chardata[1].split(' '))
-        if len(this_script_list) == 1:
-          return script_abbrevs.index(this_script_list[0])
-
-        script_numbers = []
-        for d in this_script_list:
-          script_numbers.append(script_abbrevs.index(d))
-        script_numbers.append(0)
-        script_numbers_length = len(script_numbers)
-
-        for i in range(1, len(script_lists) - script_numbers_length + 1):
-          for j in range(0, script_numbers_length):
-            found = True
-            if script_lists[i+j] != script_numbers[j]:
-              found = False
-              break
-          if found:
-            return -i
-
-        # Not found in existing lists
-
-        return_value = len(script_lists)
-        script_lists.extend(script_numbers)
-        return -return_value
-
-# Read the whole table in memory, setting/checking the Unicode version
-def read_table(file_name, get_value, default_value):
-        global unicode_version
-
-        f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
-        file_base = f.group(1)
-        version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
-        file = open(file_name, 'r', encoding='utf-8')
-        f = re.match(version_pat, file.readline())
-        version = f.group(1)
-        if unicode_version == "":
-                unicode_version = version
-        elif unicode_version != version:
-                print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
-
-        table = [default_value] * MAX_UNICODE
-        for line in file:
-                line = re.sub(r'#.*', '', line)
-                chardata = list(map(str.strip, line.split(';')))
-                if len(chardata) <= 1:
-                        continue
-                value = get_value(chardata)
-                m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-                char = int(m.group(1), 16)
-                if m.group(3) is None:
-                        last = char
-                else:
-                        last = int(m.group(3), 16)
-                for i in range(char, last + 1):
-                        # It is important not to overwrite a previously set
-                        # value because in the CaseFolding file there are lines
-                        # to be ignored (returning the default value of 0)
-                        # which often come after a line which has already set
-                        # data.
-                        if table[i] == default_value:
-                          table[i] = value
-        file.close()
-        return table
-
-# Get the smallest possible C language type for the values
-def get_type_size(table):
-        type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
-                                 ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
-        limits = [(0, 255), (0, 65535), (0, 4294967295),
-                          (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
-        minval = min(table)
-        maxval = max(table)
-        for num, (minlimit, maxlimit) in enumerate(limits):
-                if minlimit <= minval and maxval <= maxlimit:
-                        return type_size[num]
-        else:
-                raise OverflowError("Too large to fit into C types")
-
-def get_tables_size(*tables):
-        total_size = 0
-        for table in tables:
-                type, size = get_type_size(table)
-                total_size += size * len(table)
-        return total_size
-
-# Compress the table into the two stages
-def compress_table(table, block_size):
-        blocks = {} # Dictionary for finding identical blocks
-        stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
-        stage2 = [] # Stage 2 table contains the blocks with property values
-        table = tuple(table)
-        for i in range(0, len(table), block_size):
-                block = table[i:i+block_size]
-                start = blocks.get(block)
-                if start is None:
-                        # Allocate a new block
-                        start = len(stage2) / block_size
-                        stage2 += block
-                        blocks[block] = start
-                stage1.append(start)
-
-        return stage1, stage2
-
-# Print a table
-def print_table(table, table_name, block_size = None):
-        type, size = get_type_size(table)
-        ELEMS_PER_LINE = 16
-
-        s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
-        if block_size:
-                s += ", block = %d" % block_size
-        print(s + " */")
-        table = tuple(table)
-        if block_size is None:
-                fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
-                mult = MAX_UNICODE / len(table)
-                for i in range(0, len(table), ELEMS_PER_LINE):
-                        print(fmt % (table[i:i+ELEMS_PER_LINE] +
-                          (int(i * mult),)))
-        else:
-                if block_size > ELEMS_PER_LINE:
-                        el = ELEMS_PER_LINE
-                else:
-                        el = block_size
-                fmt = "%3d," * el + "\n"
-                if block_size > ELEMS_PER_LINE:
-                        fmt = fmt * int(block_size / ELEMS_PER_LINE)
-                for i in range(0, len(table), block_size):
-                        print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
-        print("};\n")
-
-# Extract the unique combinations of properties into records
-def combine_tables(*tables):
-        records = {}
-        index = []
-        for t in zip(*tables):
-                i = records.get(t)
-                if i is None:
-                        i = records[t] = len(records)
-                index.append(i)
-        return index, records
-
-def get_record_size_struct(records):
-        size = 0
-        structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
-        'types in this structure definition from pcre2_internal.h (the actual\n' + \
-        'field names will be different):\n\ntypedef struct {\n'
-        for i in range(len(records[0])):
-                record_slice = [record[i] for record in records]
-                slice_type, slice_size = get_type_size(record_slice)
-                # add padding: round up to the nearest power of slice_size
-                size = (size + slice_size - 1) & -slice_size
-                size += slice_size
-                structure += '%s property_%d;\n' % (slice_type, i)
-
-        # round up to the first item of the next structure in array
-        record_slice = [record[0] for record in records]
-        slice_type, slice_size = get_type_size(record_slice)
-        size = (size + slice_size - 1) & -slice_size
-
-        structure += '} ucd_record;\n*/\n'
-        return size, structure
-
-def test_record_size():
-        tests = [ \
-          ( [(3,), (6,), (6,), (1,)], 1 ), \
-          ( [(300,), (600,), (600,), (100,)], 2 ), \
-          ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
-          ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
-          ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
-          ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
-          ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
-          ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
-        ]
-        for test in tests:
-            size, struct = get_record_size_struct(test[0])
-            assert(size == test[1])
-            #print struct
-
-def print_records(records, record_size):
-        print('const ucd_record PRIV(ucd_records)[] = { ' + \
-              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
-
-        records = list(zip(list(records.keys()), list(records.values())))
-        records.sort(key = lambda x: x[1])
-        for i, record in enumerate(records):
-                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
-        print('};\n')
-
-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
-# New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
-# New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
-# New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
-# New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic',
-# New for Unicode 6.1.0
- 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
-# New for Unicode 7.0.0
- 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
- 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
- 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
- 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
-# New for Unicode 8.0.0
- 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
- 'SignWriting',
-# New for Unicode 10.0.0
- 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
- 'Nushu', 'Soyombo', 'Zanabazar_Square',
-# New for Unicode 11.0.0
-  'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
-  'Old_Sogdian', 'Sogdian',
-# New for Unicode 12.0.0
-  'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
-# New for Unicode 13.0.0
-  'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi'
- ]
-
-script_abbrevs = [
-  'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
-  'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
-  'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
-  'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
-  'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
-  'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
-  'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
-#New for Unicode 5.0
-  'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
-#New for Unicode 5.1
-  'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
-  'Sund', 'Vaii',
-#New for Unicode 5.2
-  'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
-  'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
-#New for Unicode 6.0.0
-  'Batk', 'Brah', 'Mand',
-#New for Unicode 6.1.0
-  'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
-#New for Unicode 7.0.0
-  'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
-  'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
-  'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
-#New for Unicode 8.0.0
-  'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
-#New for Unicode 10.0.0
-  'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
-  'Zanb',
-#New for Unicode 11.0.0
-  'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
-#New for Unicode 12.0.0
-  'Elym', 'Nand', 'Hmnp', 'Wcho',
-#New for Unicode 13.0.0
-  'Chrs', 'Diak', 'Kits', 'Yezi'
-  ]
-
-category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
-  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
-  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
-
-# The Extended_Pictographic property is not found in the file where all the
-# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
-# file, but we list it here so that the name has the correct index value.
-
-break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
-  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
-  'ZWJ', 'Extended_Pictographic' ]
-
-test_record_size()
-unicode_version = ""
-
-script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
-category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
-break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
-other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
-
-# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
-# we need to find the Extended_Pictographic property for emoji characters. This
-# can be set as an additional grapheme break property, because the default for
-# all the emojis is "other". We scan the emoji-data.txt file and modify the
-# break-props table.
-
-file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
-for line in file:
-        line = re.sub(r'#.*', '', line)
-        chardata = list(map(str.strip, line.split(';')))
-        if len(chardata) <= 1:
-                continue
-
-        if chardata[1] != "Extended_Pictographic":
-                continue
-
-        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-        char = int(m.group(1), 16)
-        if m.group(3) is None:
-                last = char
-        else:
-                last = int(m.group(3), 16)
-        for i in range(char, last + 1):
-                if break_props[i] != break_property_names.index('Other'):
-                   print("WARNING: Emoji 0x%x has break property %s, not 'Other'",
-                     i, break_property_names[break_props[i]], file=sys.stderr)
-                break_props[i] = break_property_names.index('Extended_Pictographic')
-file.close()
-
-# The Script Extensions property default value is the Script value. Parse the
-# file, setting 'Unknown' as the default (this will never be a Script Extension
-# value), then scan it and fill in the default from Scripts. Code added by PH
-# in October 2018. Positive values are used for just a single script for a
-# code point. Negative values are negated offsets in a list of lists of
-# multiple scripts. Initialize this list with a single entry, as the zeroth
-# element is never used.
-
-script_lists = [0]
-script_abbrevs_default = script_abbrevs.index('Zzzz')
-scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
-
-for i in range(0, MAX_UNICODE):
-  if scriptx[i] == script_abbrevs_default:
-    scriptx[i] = script[i]
-
-# With the addition of the new Script Extensions field, we need some padding
-# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
-# greater than 255 to make the field 16 bits.
-
-padding_dummy = [0] * MAX_UNICODE
-padding_dummy[0] = 256
-
-# This block of code was added by PH in September 2012. I am not a Python
-# programmer, so the style is probably dreadful, but it does the job. It scans
-# the other_case table to find sets of more than two characters that must all
-# match each other caselessly. Later in this script a table of these sets is
-# written out. However, we have to do this work here in order to compute the
-# offsets in the table that are inserted into the main table.
-
-# The CaseFolding.txt file lists pairs, but the common logic for reading data
-# sets only one value, so first we go through the table and set "return"
-# offsets for those that are not already set.
-
-for c in range(MAX_UNICODE):
-  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
-    other_case[c + other_case[c]] = -other_case[c]
-
-# Now scan again and create equivalence sets.
-
-sets = []
-
-for c in range(MAX_UNICODE):
-  o = c + other_case[c]
-
-  # Trigger when this character's other case does not point back here. We
-  # now have three characters that are case-equivalent.
-
-  if other_case[o] != -other_case[c]:
-    t = o + other_case[o]
-
-    # Scan the existing sets to see if any of the three characters are already
-    # part of a set. If so, unite the existing set with the new set.
-
-    appended = 0
-    for s in sets:
-      found = 0
-      for x in s:
-        if x == c or x == o or x == t:
-          found = 1
-
-      # Add new characters to an existing set
-
-      if found:
-        found = 0
-        for y in [c, o, t]:
-          for x in s:
-            if x == y:
-              found = 1
-          if not found:
-            s.append(y)
-        appended = 1
-
-    # If we have not added to an existing set, create a new one.
-
-    if not appended:
-      sets.append([c, o, t])
-
-# End of loop looking for caseless sets.
-
-# Now scan the sets and set appropriate offsets for the characters.
-
-caseless_offsets = [0] * MAX_UNICODE
-
-offset = 1;
-for s in sets:
-  for x in s:
-    caseless_offsets[x] = offset
-  offset += len(s) + 1
-
-# End of block of code for creating offsets for caseless matching sets.
-
-
-# Combine the tables
-
-table, records = combine_tables(script, category, break_props,
-  caseless_offsets, other_case, scriptx, padding_dummy)
-
-record_size, record_struct = get_record_size_struct(list(records.keys()))
-
-# Find the optimum block size for the two-stage table
-min_size = sys.maxsize
-for block_size in [2 ** i for i in range(5,10)]:
-        size = len(records) * record_size
-        stage1, stage2 = compress_table(table, block_size)
-        size += get_tables_size(stage1, stage2)
-        #print "/* block size %5d  => %5d bytes */" % (block_size, size)
-        if size < min_size:
-                min_size = size
-                min_stage1, min_stage2 = stage1, stage2
-                min_block_size = block_size
-
-print("/* This module is generated by the maint/MultiStage2.py script.")
-print("Do not modify it by hand. Instead modify the script and run it")
-print("to regenerate this code.")
-print()
-print("As well as being part of the PCRE2 library, this module is #included")
-print("by the pcre2test program, which redefines the PRIV macro to change")
-print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
-print("with the library. At present, just one of these tables is actually")
-print("needed. */")
-print()
-print("#ifndef PCRE2_PCRE2TEST")
-print()
-print("#ifdef HAVE_CONFIG_H")
-print("#include \"config.h\"")
-print("#endif")
-print()
-print("#include \"pcre2_internal.h\"")
-print()
-print("#endif /* PCRE2_PCRE2TEST */")
-print()
-print("/* Unicode character database. */")
-print("/* This file was autogenerated by the MultiStage2.py script. */")
-print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
-print()
-print("/* The tables herein are needed only when UCP support is built,")
-print("and in PCRE2 that happens automatically with UTF support.")
-print("This module should not be referenced otherwise, so")
-print("it should not matter whether it is compiled or not. However")
-print("a comment was received about space saving - maybe the guy linked")
-print("all the modules rather than using a library - so we include a")
-print("condition to cut out the tables when not needed. But don't leave")
-print("a totally empty module because some compilers barf at that.")
-print("Instead, just supply some small dummy tables. */")
-print()
-print("#ifndef SUPPORT_UNICODE")
-print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
-print("const uint16_t PRIV(ucd_stage1)[] = {0};")
-print("const uint16_t PRIV(ucd_stage2)[] = {0};")
-print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
-print("#else")
-print()
-print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
-print()
-print("/* If the 32-bit library is run in non-32-bit mode, character values")
-print("greater than 0x10ffff may be encountered. For these we set up a")
-print("special record. */")
-print()
-print("#if PCRE2_CODE_UNIT_WIDTH == 32")
-print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
-print("  ucp_Unknown,    /* script */")
-print("  ucp_Cn,         /* type unassigned */")
-print("  ucp_gbOther,    /* grapheme break property */")
-print("  0,              /* case set */")
-print("  0,              /* other case */")
-print("  ucp_Unknown,    /* script extension */")
-print("  0,              /* dummy filler */")
-print("  }};")
-print("#endif")
-print()
-print(record_struct)
-
-# --- Added by PH: output the table of caseless character sets ---
-
-print("/* This table contains lists of characters that are caseless sets of")
-print("more than one character. Each list is terminated by NOTACHAR. */\n")
-
-print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
-print("  NOTACHAR,")
-for s in sets:
-  s = sorted(s)
-  for x in s:
-    print('  0x%04x,' % x, end=' ')
-  print('  NOTACHAR,')
-print('};')
-print()
-
-# ------
-
-print("/* When #included in pcre2test, we don't need the table of digit")
-print("sets, nor the the large main UCD tables. */")
-print()
-print("#ifndef PCRE2_PCRE2TEST")
-print()
-
-# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
-
-digitsets = []
-file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
-
-for line in file:
-  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
-  if m is None:
-    continue
-  first = int(m.group(1),16)
-  last  = int(m.group(2),16)
-  if ((last - first + 1) % 10) != 0:
-    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
-      file=sys.stderr)
-  while first < last:
-    digitsets.append(first + 9)
-    first += 10
-file.close()
-digitsets.sort()
-
-print("/* This table lists the code points for the '9' characters in each")
-print("set of decimal digits. It is used to ensure that all the digits in")
-print("a script run come from the same set. */\n")
-print("const uint32_t PRIV(ucd_digit_sets)[] = {")
-
-print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
-count = 8
-for d in digitsets:
-  if count == 8:
-    print("\n ", end='')
-    count = 0
-  print(" 0x%05x," % d, end='')
-  count += 1
-print("\n};\n")
-
-print("/* This vector is a list of lists of scripts for the Script Extension")
-print("property. Each sublist is zero-terminated. */\n")
-print("const uint8_t PRIV(ucd_script_sets)[] = {")
-
-count = 0
-print("  /*   0 */", end='')
-for d in script_lists:
-  print(" %3d," % d, end='')
-  count += 1
-  if d == 0:
-    print("\n  /* %3d */" % count, end='')
-print("\n};\n")
-
-# Output the main UCD tables.
-
-print("/* These are the main two-stage UCD tables. The fields in each record are:")
-print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
-print("offset to multichar other cases or zero (8 bits), offset to other case")
-print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
-print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
-
-print_records(records, record_size)
-print_table(min_stage1, 'PRIV(ucd_stage1)')
-print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
-print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
-print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
-print("#endif")
-print("#endif  /* SUPPORT_UNICODE */")
-print()
-print("#endif  /* PCRE2_PCRE2TEST */")
-
-
-# This code was part of the original contribution, but is commented out as it
-# was never used. A two-stage table has sufficed.
-
-"""
-
-# Three-stage tables:
-
-# Find the optimum block size for 3-stage table
-min_size = sys.maxint
-for stage3_block in [2 ** i for i in range(2,6)]:
-        stage_i, stage3 = compress_table(table, stage3_block)
-        for stage2_block in [2 ** i for i in range(5,10)]:
-                size = len(records) * 4
-                stage1, stage2 = compress_table(stage_i, stage2_block)
-                size += get_tables_size(stage1, stage2, stage3)
-                # print "/* %5d / %3d  => %5d bytes */" % (stage2_block, stage3_block, size)
-                if size < min_size:
-                        min_size = size
-                        min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
-                        min_stage2_block, min_stage3_block = stage2_block, stage3_block
-
-print "/* Total size: %d bytes" % min_size */
-print_records(records)
-print_table(min_stage1, 'ucd_stage1')
-print_table(min_stage2, 'ucd_stage2', min_stage2_block)
-print_table(min_stage3, 'ucd_stage3', min_stage3_block)
-
-"""
--- a/maint/README
+++ b/maint/README
@ -16,99 +16,122 @@ and also contains some notes for maintainers. Its contents are:
 Files in the maint directory
 ============================

-GenerateUtt.py   A Python script to generate part of the pcre2_tables.c file
-                 that contains Unicode script names in a long string with
-                 offsets, which is tedious to maintain by hand.
+GenerateCommon.py
+  A Python module containing data and functions that are used by the other
+  Generate scripts.
+  
+GenerateTest26.py
+  A Python script that generates input and expected output test data for test
+  26, which tests certain aspects of Unicode property support.  

-ManyConfigTests  A shell script that runs "configure, make, test" a number of
-                 times with different configuration settings.
+GenerateUcd.py
+  A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
+  and Unicode data files, which are themselves downloaded from the Unicode web
+  site. The generated file contains the tables for a 2-stage lookup of Unicode
+  properties, along with some auxiliary tables. The script starts with a long
+  comment that gives details of the tables it constructs. 

-MultiStage2.py   A Python script that generates the file pcre2_ucd.c from six
-                 Unicode data files, which are themselves downloaded from the
-                 Unicode web site. Run this script in the "maint" directory.
-                 The generated file is written to stdout. It contains the
-                 tables for a 2-stage lookup of Unicode properties, along with
-                 some auxiliary tables.
+GenerateUcpHeader.py
+  A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
+  and Unicode data files. The generated file defines constants for various
+  Unicode property values.
+
+GenerateUcpTables.py
+  A Python script that generates the file pcre2_ucptables.c from
+  GenerateCommon.py and Unicode data files. The generated file contains tables
+  for looking up Unicode property names.
+
+ManyConfigTests
+  A shell script that runs "configure, make, test" a number of times with
+  different configuration settings.

 pcre2_chartables.c.non-standard
-                 This is a set of character tables that came from a Windows
-                 system. It has characters greater than 128 that are set as
-                 spaces, amongst other things. I kept it so that it can be
-                 used for testing from time to time.
+  This is a set of character tables that came from a Windows system. It has
+  characters greater than 128 that are set as spaces, amongst other things. I
+  kept it so that it can be used for testing from time to time.

-README           This file.
+README
+  This file.

-Unicode.tables   The files in this directory were downloaded from the Unicode
-                 web site. They contain information about Unicode characters
-                 and scripts. The ones used by the MultiStage2.py script are
-                 CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt,
-                 ScriptExtensions.txt, GraphemeBreakProperty.txt, and
-                 emoji-data.txt. I've kept UnicodeData.txt (which is no longer
-                 used by the script) because it is useful occasionally for
-                 manually looking up the details of certain characters.
-                 However, note that character names in this file such as
-                 "Arabic sign sanah" do NOT mean that the character is in a
-                 particular script (in this case, Arabic). Scripts.txt and
-                 ScriptExtensions.txt are where to look for script information.
+Unicode.tables
+  The files in this directory were downloaded from the Unicode web site. They
+  contain information about Unicode characters and scripts, and are used by the
+  Generate scripts. UnicodeData.txt, which is no longer used by any script,
+  is retained because it is useful occasionally for manually looking up the
+  details of certain characters. However, note that character names in this
+  file such as "Arabic sign sanah" do NOT mean that the character is in a
+  particular script (in this case, Arabic). Scripts.txt and
+  ScriptExtensions.txt are where to look for script information.

-ucptest.c        A short C program for testing the Unicode property macros
-                 that do lookups in the pcre2_ucd.c data, mainly useful after
-                 rebuilding the Unicode property table. Compile and run this in
-                 the "maint" directory (see comments at its head). This program 
-                 can also be used to find characters with specific properties. 
+ucptest.c
+  A program for testing the Unicode property macros that do lookups in the
+  pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
+  Compile and run this in the "maint" directory (see comments at its head).
+  This program can also be used to find characters with specific properties and 
+  to list which properties are supported. 

-ucptestdata      A directory containing four files, testinput{1,2} and
-                 testoutput{1,2}, for use in conjunction with the ucptest
-                 program.
+ucptestdata
+  A directory containing four files, testinput{1,2} and testoutput{1,2}, for
+  use in conjunction with the ucptest program.

-utf8.c           A short, freestanding C program for converting a Unicode code
-                 point into a sequence of bytes in the UTF-8 encoding, and vice
-                 versa. If its argument is a hex number such as 0x1234, it
-                 outputs a list of the equivalent UTF-8 bytes. If its argument
-                 is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
-                 treats them as a UTF-8 character and outputs the equivalent
-                 code point in hex. See comments at its head for details.
+utf8.c
+  A short, freestanding C program for converting a Unicode code point into a
+  sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
+  hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
+  If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
+  treats them as a UTF-8 string and outputs the equivalent code points in hex.
+  See comments at its head for details.


 Updating to a new Unicode release
 =================================

 When there is a new release of Unicode, the files in Unicode.tables must be
-refreshed from the web site. If the new version of Unicode adds new character
-scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
-GenerateUtt.py scripts must be edited to add the new names. I have been adding
-each new group at the end of the relevant list, with a comment. Note also that
-both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode
-script names.
+refreshed from the web site. Once that is done, the four Python scripts that 
+generate files from the Unicode data can be run from within the "maint" 
+directory.

-MultiStage2.py has two lists: the full names and the abbreviations that are
-found in the ScriptExtensions.txt file. A list of script names and their
-abbreviations can be found in the PropertyValueAliases.txt file on the
-Unicode web site. There is also a Wikipedia page that lists them, and notes the
-Unicode version in which they were introduced:
+Note: Previously, it was necessary to update lists of scripts and their 
+abbreviations by hand before running the Python scripts. This is no longer
+necessary because the scripts have been upgraded to extract this information
+themselves. Also, there used to be explicit lists of scripts in two of the man
+pages. This is no longer the case; the pcre2test program can now output a list 
+of supported scripts.

-https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
+You can give an output file name as an argument to the following scripts, but
+by default:

-Once the script name lists have been updated, MultiStage2.py can be run to
-generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to
-generate the tricky tables for inclusion in pcre2_tables.c (which must be
-hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x
-not in list", the cause is usually a missing (or misspelt) name in one of the
-lists of scripts.
+GenerateUcd.py        creates pcre2_ucd.c        )
+GenerateUcpHeader.py  creates pcre2_ucp.h        ) in the current directory
+GenerateUcpTables.py  creates pcre2_ucptables.c  )

-The ucptest program can be compiled and used to check that the new tables in
-pcre2_ucd.c work properly, using the data files in ucptestdata to check a
-number of test characters. It used to be necessary to update the source
-ucptest.c whenever new Unicode scripts were added, but this is no longer
-required because that program now uses the lists in the PCRE2 source. However,
-adding a few tests for new scripts to the files in ucptestdata is a good idea.
+These files can be compared against the existing versions in the src directory
+to check on any changes before replacing the old files, but you can also
+generate directly into the final location by running:
+
+./GenerateUcd.py       ../src/pcre2_ucd.c
+./GenerateUcpHeader.py ../src/pcre2_ucp.h
+./GenerateUcpTables.py ../src/pcre2_ucptables.c
+
+Once the .c and .h files are in the ../src directory, the ucptest program can
+be compiled and used to check that the new tables work properly. The data files
+in ucptestdata are set up to check a number of test characters. See the
+comments at the start of ucptest.c. If there are new scripts, adding a few
+tests to the files in ucptestdata is a good idea.
+
+Finally, you should run the GenerateTest26.py script to regenerate the input
+and expected output files for a series of Unicode property tests that
+are automatically generated from the Unicode data files. By default, the files
+are written to testinput26 and testoutput26 in the current directory, but you
+can give an alternative directory name as an argument to the script. These
+files should eventually be installed in the main testdata directory.


 Preparing for a PCRE2 release
 =============================

-This section contains a checklist of things that I consult before building a
-distribution for a new release.
+This section contains a checklist of things that I do before building a new
+release.

 . Ensure that the version number and version date are correct in configure.ac.

@ -117,19 +140,19 @@ distribution for a new release.

 . If new build options or new source files have been added, ensure that they
  are added to the CMake files as well as to the autoconf files. The relevant
-  files are CMakeLists.txt and config-cmake.h.in. After making a release
-  tarball, test it out with CMake if there have been changes here.
+  files are CMakeLists.txt and config-cmake.h.in. After making a release, test
+  it out with CMake if there have been changes here.

 . Run ./autogen.sh to ensure everything is up-to-date.

 . Compile and test with many different config options, and combinations of
  options. Also, test with valgrind by running "RunTest valgrind" and
-  "RunGrepTest valgrind" (which takes quite a long time). The script
-  maint/ManyConfigTests now encapsulates this testing. It runs tests with
-  different configurations, and it also runs some of them with valgrind, all of
-  which can take quite some time.
+  "RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
+  this testing. It runs tests with different configurations, and it also runs
+  some of them with valgrind, all of which can take quite some time.

-. Run tests in both 32-bit and 64-bit environments if possible.
+. Run tests in both 32-bit and 64-bit environments if possible. I can no longer
+  run 32-bit tests.

 . Run tests with two or more different compilers (e.g. clang and gcc), and
  make use of -fsanitize=address and friends where possible. For gcc,
@ -140,7 +163,9 @@ distribution for a new release.
  be added when compiling with JIT. Another useful clang option is
  -fsanitize=signed-integer-overflow

-. Do a test build using CMake.
+. Do a test build using CMake. Remove src/config.h first, lest it override the
+  version that CMake creates. Also do a CMake unity build to check that it 
+  still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.

 . Run perltest.sh on the test data for tests 1 and 4. The output should match
  the PCRE2 test output, apart from the version identification at the start of
@ -159,12 +184,12 @@ distribution for a new release.
  systems. For example, on Solaris it is helpful to test using Sun's cc
  compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
  64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
-  for test 2. Since I retired I can no longer do much of this, but instead I
-  rely on putting out release candidates for folks on the pcre-dev list to
-  test.
+  for test 2. Since I retired I can no longer do much of this. There are 
+  automated tests under Ubuntu, Alpine, and Windows that are now set up as 
+  GitHub actions. Check that they are running clean.

 . The buildbots at http://buildfarm.opencsw.org/ do some automated testing
-  of PCRE2 and should be checked before putting out a release.
+  of PCRE2 and should also be checked before putting out a release.


 Updating version info for libtool
@ -214,20 +239,20 @@ changes in a shared library:
 Making a PCRE2 release
 ======================

-Run PrepareRelease and commit the files that it changes (by removing trailing
-spaces). The first thing this script does is to run CheckMan on the man pages;
-if it finds any markup errors, it reports them and then aborts.
+Run PrepareRelease and commit the files that it changes. The first thing this
+script does is to run CheckMan on the man pages; if it finds any markup errors,
+it reports them and then aborts. Otherwise it removes trailing spaces from
+sources and refreshes the HTML documentation. Update the GitHub repository with
+"git push".

 Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
-and the zipball. Double-check with "svn status", then create an SVN tagged
-copy:
-
-  svn copy svn://vcs.exim.org/pcre2/code/trunk \
-           svn://vcs.exim.org/pcre2/code/tags/pcre2-10.xx
+and the zipball. I then sign these files. Double-check with "git status" that
+the repository is fully up-to-date, then create a new tag and a release on
+GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
+GitHub release.

 When the new release is out, don't forget to tell webmaster@pcre.org and the
-mailing list. Also, update the list of version numbers in Bugzilla
-(administration > products > PCRE > Edit versions).
+mailing list.


 Future ideas (wish list)
@ -235,7 +260,8 @@ Future ideas (wish list)

 This section records a list of ideas so that they do not get forgotten. They
 vary enormously in their usefulness and potential for implementation. Some are
-very sensible; some are rather wacky. Some have been on this list for years.
+very sensible; some are rather wacky. Some have been on this list for many
+years.

 . Optimization

@ -276,9 +302,6 @@ very sensible; some are rather wacky. Some have been on this list for years.

 . An option to convert results into character offsets and character lengths.

-. An option for pcre2grep to scan only the start of a file. I am not keen -
-  this is the job of "head".
-
 . A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
  preceded by a blank line, instead of adding it to every matched line, and (b)
  support --outputfile=name.
@ -317,10 +340,9 @@ very sensible; some are rather wacky. Some have been on this list for years.

 . PCRE2 cannot at present distinguish between subpatterns with different names,
  but the same number (created by the use of ?|). In order to do so, a way of
-  remembering *which* subpattern numbered n matched is needed. Bugzilla #760.
-  (*MARK) can perhaps be used as a way round this problem. However, note that
-  Perl does not distinguish: like PCRE2, a name is just an alias for a number
-  in Perl.
+  remembering *which* subpattern numbered n matched is needed. (*MARK) can
+  perhaps be used as a way round this problem. However, note that Perl does not
+  distinguish: like PCRE2, a name is just an alias for a number in Perl.

 . Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
  "something" and then the #ifdef appears only in one place, in "something".
@ -346,10 +368,6 @@ very sensible; some are rather wacky. Some have been on this list for years.

  See Unicode TR 29. The last two are very much aimed at natural language.

-. (?[...]) extended classes: big project.
-
-. Bugzilla #1694 requests backwards searching.
-
 . Allow a callout to specify a number of characters to skip. This can be done
  compatibly via an extra callout field.

@ -361,9 +379,6 @@ very sensible; some are rather wacky. Some have been on this list for years.
 . A limit on substitutions: a user suggested somehow finding a way of making
  match_limit apply to the whole operation instead of each match separately.

-. Redesign handling of class/nclass/xclass because the compile code logic is
-  currently very contorted and obscure.
-
 . Some #defines could be replaced with enums to improve robustness.

 . There was a request for an option for pcre2_match() to return the longest
@ -380,7 +395,8 @@ very sensible; some are rather wacky. Some have been on this list for years.
  The test function could make use of get_substrings() to cover more code.

 . A neater way of handling recursion file names in pcre2grep, e.g. a single
-  buffer that can grow.
+  buffer that can grow. See also GitHub issue #2 (recursion looping via
+  symlinks).

 . A user suggested that before/after parameters in pcre2grep could have
  negative values, to list lines near to the matched line, but not necessarily
@ -395,14 +411,7 @@ very sensible; some are rather wacky. Some have been on this list for years.
 . Breaking loops that match an empty string: perhaps find a way of continuing
  if *something* has changed, but this might mean remembering additional data.
  "Something" could be a capture value, but then a list of previous values
-  would be needed to avoid a cycle of changes. Bugzilla #2182.
-
-. The use of \K in assertions is problematic. There was some talk of Perl
-  banning this, but it hasn't happened. Some problems could be avoided by
-  not allowing it to set a value before the match start; others by not allowing
-  it to set a value after the match end. This could be controlled by an option
-  such as PCRE2_SANE_BACKSLASH_K, for compatibility (or possibly make the sane
-  behaviour the default and implement PCRE2_INSANE_BACKSLASH_K).
+  would be needed to avoid a cycle of changes.

 . If a function could be written to find 3-character (or other length) fixed
  strings, at least one of which must be present for a match, efficient
@ -410,6 +419,8 @@ very sensible; some are rather wacky. Some have been on this list for years.

 . If pcre2grep had --first-line (match only in the first line) it could be
  efficiently used to find files "starting with xxx". What about --last-line?
+  There was also the suggestion of an option for pcre2grep to scan only the
+  start of a file. I am not keen - this is the job of "head".

 . A user requested a means of determining whether a failed match was failed by
  the start-of-match optimizations, or by running the match engine. Easy enough
@ -419,25 +430,31 @@ very sensible; some are rather wacky. Some have been on this list for years.
  interpreters? JIT already does some of this, but it may not be worth it for
  the interpreters.

-. There was a request for a way of re-defining \w (and therefore \W, \b, and
-  \B). An in-pattern sequence such as (?w=[...]) was suggested. Easiest way
-  would be simply to inline the class, with lookarounds for \b and \B. Ideally
-  the setting should last till the end of the group, which means remembering
-  all previous settings; maybe a fixed amount of stack would do - how deep
-  would anyone want to nest these things? Bugzilla #2301.
-
-. Recognize the short script names. They are already listed in maint/
-  Multistage2.py because they are needed for scanning the script extensions
-  file.
-
-. Use script extensions for \p?
+. Redesign handling of class/nclass/xclass because the compile code logic is
+  currently very contorted and obscure. Also there was a request for a way of
+  re-defining \w (and therefore \W, \b, and \B). An in-pattern sequence such as
+  (?w=[...]) was suggested. Easiest way would be simply to inline the class,
+  with lookarounds for \b and \B. Ideally the setting should last till the end
+  of the group, which means remembering all previous settings; maybe a fixed
+  amount of stack would do - how deep would anyone want to nest these things?
+  See GitHub issue #13 for a compendium of character class issues, including
+  (?[...]) extended classes.

 . A user suggested something like --with-build-info to set a build information
  string that could be retrieved by pcre2_config(). However, there's no
  facility for a length limit in pcre2_config(), and what would be the
  encoding?

+. Quantified groups with a fixed count currently operate by replicating the
+  group in the compiled bytecode. This may not really matter in these days of
+  gigabyte memory, but perhaps another implementation might be considered.
+  Needs coordination between the interpreters and JIT.
+
+. There are regular requests for variable-length lookbehinds.
+
+. See also any suggestions in the GitHub issues.
+
 Philip Hazel
-Email local part: ph10
-Email domain: cam.ac.uk
-Last updated: 01 April 2020
+Email local part: Philip.Hazel
+Email domain: gmail.com
+Last updated: 25 April 2022
--- a/maint/Unicode.tables/BidiMirroring.txt
+++ b/maint/Unicode.tables/BidiMirroring.txt
@ -0,0 +1,633 @@
+# BidiMirroring-14.0.0.txt
+# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
+# © 2021 Unicode®, Inc.
+# For terms of use, see https://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see https://www.unicode.org/reports/tr44/
+#
+# Bidi_Mirroring_Glyph Property
+#
+# This file is an informative contributory data file in the
+# Unicode Character Database.
+#
+# This data file lists characters that have the Bidi_Mirrored=Yes property
+# value, for which there is another Unicode character that typically has a glyph
+# that is the mirror image of the original character's glyph.
+#
+# The repertoire covered by the file is Unicode 14.0.0.
+#
+# The file contains a list of lines with mappings from one code point
+# to another one for character-based mirroring.
+# Note that for "real" mirroring, a rendering engine needs to select
+# appropriate alternative glyphs, and that many Unicode characters do not
+# have a mirror-image Unicode character.
+#
+# Each mapping line contains two fields, separated by a semicolon (';').
+# Each of the two fields contains a code point represented as a
+# variable-length hexadecimal value with 4 to 6 digits.
+# A comment indicates where the characters are "BEST FIT" mirroring.
+#
+# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
+# characters exist with mirrored glyphs, are
+# listed as comments at the end of the file.
+#
+# Formally, the default value of the Bidi_Mirroring_Glyph property
+# for each code point is <none>, unless a mapping to
+# some other character is specified in this data file. When a code
+# point has the default value for the Bidi_Mirroring_Glyph property,
+# that means that no other character exists whose glyph is suitable
+# for character-based mirroring.
+#
+# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
+# at https://www.unicode.org/reports/tr9/
+#
+# This file was originally created by Markus Scherer.
+# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
+# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
+#
+# Historical and Compatibility Information:
+#
+# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
+# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
+# See https://www.microsoft.com/typography/otspec/ompl.txt
+#
+# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
+# added one mirroring pair: 27CB <--> 27CD.
+#
+# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
+# underwent a substantial revision, to formally recognize all of the
+# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
+# added after the freezing of the OMPL list. As a result, starting
+# with Unicode 11.0, the bmg mapping values more accurately reflect
+# the current status of glyphs for Bidi_Mirrored characters in
+# the Unicode Standard, but this listing now extends significantly
+# beyond the frozen OMPL list. Implementers should be aware of this
+# intentional distinction.
+#
+# ############################################################
+#
+# Property:	Bidi_Mirroring_Glyph
+#
+# @missing: 0000..10FFFF; <none>
+
+0028; 0029 # LEFT PARENTHESIS
+0029; 0028 # RIGHT PARENTHESIS
+003C; 003E # LESS-THAN SIGN
+003E; 003C # GREATER-THAN SIGN
+005B; 005D # LEFT SQUARE BRACKET
+005D; 005B # RIGHT SQUARE BRACKET
+007B; 007D # LEFT CURLY BRACKET
+007D; 007B # RIGHT CURLY BRACKET
+00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
+0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
+0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
+0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
+169B; 169C # OGHAM FEATHER MARK
+169C; 169B # OGHAM REVERSED FEATHER MARK
+2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
+2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
+207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
+207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
+208D; 208E # SUBSCRIPT LEFT PARENTHESIS
+208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
+2208; 220B # ELEMENT OF
+2209; 220C # [BEST FIT] NOT AN ELEMENT OF
+220A; 220D # SMALL ELEMENT OF
+220B; 2208 # CONTAINS AS MEMBER
+220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
+220D; 220A # SMALL CONTAINS AS MEMBER
+2215; 29F5 # DIVISION SLASH
+221F; 2BFE # RIGHT ANGLE
+2220; 29A3 # ANGLE
+2221; 299B # MEASURED ANGLE
+2222; 29A0 # SPHERICAL ANGLE
+2224; 2AEE # DOES NOT DIVIDE
+223C; 223D # TILDE OPERATOR
+223D; 223C # REVERSED TILDE
+2243; 22CD # ASYMPTOTICALLY EQUAL TO
+2245; 224C # APPROXIMATELY EQUAL TO
+224C; 2245 # ALL EQUAL TO
+2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
+2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
+2254; 2255 # COLON EQUALS
+2255; 2254 # EQUALS COLON
+2264; 2265 # LESS-THAN OR EQUAL TO
+2265; 2264 # GREATER-THAN OR EQUAL TO
+2266; 2267 # LESS-THAN OVER EQUAL TO
+2267; 2266 # GREATER-THAN OVER EQUAL TO
+2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
+2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
+226A; 226B # MUCH LESS-THAN
+226B; 226A # MUCH GREATER-THAN
+226E; 226F # [BEST FIT] NOT LESS-THAN
+226F; 226E # [BEST FIT] NOT GREATER-THAN
+2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
+2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
+2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
+2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
+2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
+2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
+2276; 2277 # LESS-THAN OR GREATER-THAN
+2277; 2276 # GREATER-THAN OR LESS-THAN
+2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
+2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
+227A; 227B # PRECEDES
+227B; 227A # SUCCEEDS
+227C; 227D # PRECEDES OR EQUAL TO
+227D; 227C # SUCCEEDS OR EQUAL TO
+227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
+227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
+2280; 2281 # [BEST FIT] DOES NOT PRECEDE
+2281; 2280 # [BEST FIT] DOES NOT SUCCEED
+2282; 2283 # SUBSET OF
+2283; 2282 # SUPERSET OF
+2284; 2285 # [BEST FIT] NOT A SUBSET OF
+2285; 2284 # [BEST FIT] NOT A SUPERSET OF
+2286; 2287 # SUBSET OF OR EQUAL TO
+2287; 2286 # SUPERSET OF OR EQUAL TO
+2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
+2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
+228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
+228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
+228F; 2290 # SQUARE IMAGE OF
+2290; 228F # SQUARE ORIGINAL OF
+2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
+2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
+2298; 29B8 # CIRCLED DIVISION SLASH
+22A2; 22A3 # RIGHT TACK
+22A3; 22A2 # LEFT TACK
+22A6; 2ADE # ASSERTION
+22A8; 2AE4 # TRUE
+22A9; 2AE3 # FORCES
+22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
+22B0; 22B1 # PRECEDES UNDER RELATION
+22B1; 22B0 # SUCCEEDS UNDER RELATION
+22B2; 22B3 # NORMAL SUBGROUP OF
+22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
+22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
+22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
+22B6; 22B7 # ORIGINAL OF
+22B7; 22B6 # IMAGE OF
+22B8; 27DC # MULTIMAP
+22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
+22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
+22CB; 22CC # LEFT SEMIDIRECT PRODUCT
+22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
+22CD; 2243 # REVERSED TILDE EQUALS
+22D0; 22D1 # DOUBLE SUBSET
+22D1; 22D0 # DOUBLE SUPERSET
+22D6; 22D7 # LESS-THAN WITH DOT
+22D7; 22D6 # GREATER-THAN WITH DOT
+22D8; 22D9 # VERY MUCH LESS-THAN
+22D9; 22D8 # VERY MUCH GREATER-THAN
+22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
+22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
+22DC; 22DD # EQUAL TO OR LESS-THAN
+22DD; 22DC # EQUAL TO OR GREATER-THAN
+22DE; 22DF # EQUAL TO OR PRECEDES
+22DF; 22DE # EQUAL TO OR SUCCEEDS
+22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
+22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
+22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
+22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
+22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
+22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
+22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
+22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
+22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
+22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
+22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
+22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
+22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
+22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
+22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
+22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
+22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
+22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22F6; 22FD # ELEMENT OF WITH OVERBAR
+22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
+22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
+22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
+22FD; 22F6 # CONTAINS WITH OVERBAR
+22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
+2308; 2309 # LEFT CEILING
+2309; 2308 # RIGHT CEILING
+230A; 230B # LEFT FLOOR
+230B; 230A # RIGHT FLOOR
+2329; 232A # LEFT-POINTING ANGLE BRACKET
+232A; 2329 # RIGHT-POINTING ANGLE BRACKET
+2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
+2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
+276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
+276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
+276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
+276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
+276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
+276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
+2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
+2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
+2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
+2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
+2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
+2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
+27C3; 27C4 # OPEN SUBSET
+27C4; 27C3 # OPEN SUPERSET
+27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
+27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
+27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
+27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
+27CB; 27CD # MATHEMATICAL RISING DIAGONAL
+27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
+27D5; 27D6 # LEFT OUTER JOIN
+27D6; 27D5 # RIGHT OUTER JOIN
+27DC; 22B8 # LEFT MULTIMAP
+27DD; 27DE # LONG RIGHT TACK
+27DE; 27DD # LONG LEFT TACK
+27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
+27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
+27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
+27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
+27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
+27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
+27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
+27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
+27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
+27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
+27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
+27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
+27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
+27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
+2983; 2984 # LEFT WHITE CURLY BRACKET
+2984; 2983 # RIGHT WHITE CURLY BRACKET
+2985; 2986 # LEFT WHITE PARENTHESIS
+2986; 2985 # RIGHT WHITE PARENTHESIS
+2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
+2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
+2989; 298A # Z NOTATION LEFT BINDING BRACKET
+298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
+298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
+298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
+298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
+298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
+298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
+2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
+2991; 2992 # LEFT ANGLE BRACKET WITH DOT
+2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
+2993; 2994 # LEFT ARC LESS-THAN BRACKET
+2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
+2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
+2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
+2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
+2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
+299B; 2221 # MEASURED ANGLE OPENING LEFT
+29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
+29A3; 2220 # REVERSED ANGLE
+29A4; 29A5 # ANGLE WITH UNDERBAR
+29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
+29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
+29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
+29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
+29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
+29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
+29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
+29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
+29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
+29B8; 2298 # CIRCLED REVERSE SOLIDUS
+29C0; 29C1 # CIRCLED LESS-THAN
+29C1; 29C0 # CIRCLED GREATER-THAN
+29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
+29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
+29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
+29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
+29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
+29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
+29D4; 29D5 # TIMES WITH LEFT HALF BLACK
+29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
+29D8; 29D9 # LEFT WIGGLY FENCE
+29D9; 29D8 # RIGHT WIGGLY FENCE
+29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
+29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
+29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
+29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
+29F5; 2215 # REVERSE SOLIDUS OPERATOR
+29F8; 29F9 # BIG SOLIDUS
+29F9; 29F8 # BIG REVERSE SOLIDUS
+29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
+29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
+2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
+2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
+2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
+2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
+2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
+2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
+2A3C; 2A3D # INTERIOR PRODUCT
+2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
+2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
+2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
+2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
+2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
+2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
+2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
+2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
+2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
+2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
+2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
+2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
+2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
+2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
+2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
+2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
+2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
+2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
+2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
+2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
+2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
+2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
+2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
+2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
+2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
+2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
+2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
+2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
+2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
+2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
+2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
+2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
+2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
+2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
+2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
+2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
+2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
+2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
+2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
+2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
+2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
+2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
+2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
+2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
+2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
+2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
+2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
+2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
+2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
+2AAA; 2AAB # SMALLER THAN
+2AAB; 2AAA # LARGER THAN
+2AAC; 2AAD # SMALLER THAN OR EQUAL TO
+2AAD; 2AAC # LARGER THAN OR EQUAL TO
+2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
+2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
+2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
+2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
+2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
+2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
+2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
+2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
+2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
+2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
+2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
+2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
+2ABB; 2ABC # DOUBLE PRECEDES
+2ABC; 2ABB # DOUBLE SUCCEEDS
+2ABD; 2ABE # SUBSET WITH DOT
+2ABE; 2ABD # SUPERSET WITH DOT
+2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
+2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
+2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
+2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
+2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
+2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
+2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
+2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
+2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
+2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
+2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
+2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
+2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
+2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
+2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
+2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
+2ACF; 2AD0 # CLOSED SUBSET
+2AD0; 2ACF # CLOSED SUPERSET
+2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
+2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
+2AD3; 2AD4 # SUBSET ABOVE SUPERSET
+2AD4; 2AD3 # SUPERSET ABOVE SUBSET
+2AD5; 2AD6 # SUBSET ABOVE SUBSET
+2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
+2ADE; 22A6 # SHORT LEFT TACK
+2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
+2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
+2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
+2AEC; 2AED # DOUBLE STROKE NOT SIGN
+2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
+2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
+2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
+2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
+2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
+2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
+2BFE; 221F # REVERSED RIGHT ANGLE
+2E02; 2E03 # LEFT SUBSTITUTION BRACKET
+2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
+2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
+2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
+2E09; 2E0A # LEFT TRANSPOSITION BRACKET
+2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
+2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
+2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
+2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
+2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
+2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
+2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
+2E22; 2E23 # TOP LEFT HALF BRACKET
+2E23; 2E22 # TOP RIGHT HALF BRACKET
+2E24; 2E25 # BOTTOM LEFT HALF BRACKET
+2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
+2E26; 2E27 # LEFT SIDEWAYS U BRACKET
+2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
+2E28; 2E29 # LEFT DOUBLE PARENTHESIS
+2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
+2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
+2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
+2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
+2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
+2E59; 2E5A # TOP HALF LEFT PARENTHESIS
+2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
+2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
+2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
+3008; 3009 # LEFT ANGLE BRACKET
+3009; 3008 # RIGHT ANGLE BRACKET
+300A; 300B # LEFT DOUBLE ANGLE BRACKET
+300B; 300A # RIGHT DOUBLE ANGLE BRACKET
+300C; 300D # [BEST FIT] LEFT CORNER BRACKET
+300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
+300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
+300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
+3010; 3011 # LEFT BLACK LENTICULAR BRACKET
+3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
+3014; 3015 # LEFT TORTOISE SHELL BRACKET
+3015; 3014 # RIGHT TORTOISE SHELL BRACKET
+3016; 3017 # LEFT WHITE LENTICULAR BRACKET
+3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
+3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
+3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
+301A; 301B # LEFT WHITE SQUARE BRACKET
+301B; 301A # RIGHT WHITE SQUARE BRACKET
+FE59; FE5A # SMALL LEFT PARENTHESIS
+FE5A; FE59 # SMALL RIGHT PARENTHESIS
+FE5B; FE5C # SMALL LEFT CURLY BRACKET
+FE5C; FE5B # SMALL RIGHT CURLY BRACKET
+FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
+FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
+FE64; FE65 # SMALL LESS-THAN SIGN
+FE65; FE64 # SMALL GREATER-THAN SIGN
+FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
+FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
+FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
+FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
+FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
+FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
+FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
+FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
+FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
+FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
+FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
+FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
+
+# The following characters have no appropriate mirroring character.
+# For these characters it is up to the rendering system
+#   to provide mirrored glyphs.
+
+# 2140; DOUBLE-STRUCK N-ARY SUMMATION
+# 2201; COMPLEMENT
+# 2202; PARTIAL DIFFERENTIAL
+# 2203; THERE EXISTS
+# 2204; THERE DOES NOT EXIST
+# 2211; N-ARY SUMMATION
+# 2216; SET MINUS
+# 221A; SQUARE ROOT
+# 221B; CUBE ROOT
+# 221C; FOURTH ROOT
+# 221D; PROPORTIONAL TO
+# 2226; NOT PARALLEL TO
+# 222B; INTEGRAL
+# 222C; DOUBLE INTEGRAL
+# 222D; TRIPLE INTEGRAL
+# 222E; CONTOUR INTEGRAL
+# 222F; SURFACE INTEGRAL
+# 2230; VOLUME INTEGRAL
+# 2231; CLOCKWISE INTEGRAL
+# 2232; CLOCKWISE CONTOUR INTEGRAL
+# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
+# 2239; EXCESS
+# 223B; HOMOTHETIC
+# 223E; INVERTED LAZY S
+# 223F; SINE WAVE
+# 2240; WREATH PRODUCT
+# 2241; NOT TILDE
+# 2242; MINUS TILDE
+# 2244; NOT ASYMPTOTICALLY EQUAL TO
+# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
+# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
+# 2248; ALMOST EQUAL TO
+# 2249; NOT ALMOST EQUAL TO
+# 224A; ALMOST EQUAL OR EQUAL TO
+# 224B; TRIPLE TILDE
+# 225F; QUESTIONED EQUAL TO
+# 2260; NOT EQUAL TO
+# 2262; NOT IDENTICAL TO
+# 228C; MULTISET
+# 22A7; MODELS
+# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
+# 22AC; DOES NOT PROVE
+# 22AD; NOT TRUE
+# 22AE; DOES NOT FORCE
+# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
+# 22BE; RIGHT ANGLE WITH ARC
+# 22BF; RIGHT TRIANGLE
+# 22F5; ELEMENT OF WITH DOT ABOVE
+# 22F8; ELEMENT OF WITH UNDERBAR
+# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
+# 22FF; Z NOTATION BAG MEMBERSHIP
+# 2320; TOP HALF INTEGRAL
+# 2321; BOTTOM HALF INTEGRAL
+# 27C0; THREE DIMENSIONAL ANGLE
+# 27CC; LONG DIVISION
+# 27D3; LOWER RIGHT CORNER WITH DOT
+# 27D4; UPPER LEFT CORNER WITH DOT
+# 299C; RIGHT ANGLE VARIANT WITH SQUARE
+# 299D; MEASURED RIGHT ANGLE WITH DOT
+# 299E; ANGLE WITH S INSIDE
+# 299F; ACUTE ANGLE
+# 29A2; TURNED ANGLE
+# 29A6; OBLIQUE ANGLE OPENING UP
+# 29A7; OBLIQUE ANGLE OPENING DOWN
+# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
+# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
+# 29C9; TWO JOINED SQUARES
+# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
+# 29DC; INCOMPLETE INFINITY
+# 29E1; INCREASES AS
+# 29E3; EQUALS SIGN AND SLANTED PARALLEL
+# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
+# 29E5; IDENTICAL TO AND SLANTED PARALLEL
+# 29F4; RULE-DELAYED
+# 29F6; SOLIDUS WITH OVERBAR
+# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
+# 2A0A; MODULO TWO SUM
+# 2A0B; SUMMATION WITH INTEGRAL
+# 2A0C; QUADRUPLE INTEGRAL OPERATOR
+# 2A0D; FINITE PART INTEGRAL
+# 2A0E; INTEGRAL WITH DOUBLE STROKE
+# 2A0F; INTEGRAL AVERAGE WITH SLASH
+# 2A10; CIRCULATION FUNCTION
+# 2A11; ANTICLOCKWISE INTEGRATION
+# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
+# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
+# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
+# 2A15; INTEGRAL AROUND A POINT OPERATOR
+# 2A16; QUATERNION INTEGRAL OPERATOR
+# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
+# 2A18; INTEGRAL WITH TIMES SIGN
+# 2A19; INTEGRAL WITH INTERSECTION
+# 2A1A; INTEGRAL WITH UNION
+# 2A1B; INTEGRAL WITH OVERBAR
+# 2A1C; INTEGRAL WITH UNDERBAR
+# 2A1E; LARGE LEFT TRIANGLE OPERATOR
+# 2A1F; Z NOTATION SCHEMA COMPOSITION
+# 2A20; Z NOTATION SCHEMA PIPING
+# 2A21; Z NOTATION SCHEMA PROJECTION
+# 2A24; PLUS SIGN WITH TILDE ABOVE
+# 2A26; PLUS SIGN WITH TILDE BELOW
+# 2A29; MINUS SIGN WITH COMMA ABOVE
+# 2A3E; Z NOTATION RELATIONAL COMPOSITION
+# 2A57; SLOPING LARGE OR
+# 2A58; SLOPING LARGE AND
+# 2A6A; TILDE OPERATOR WITH DOT ABOVE
+# 2A6B; TILDE OPERATOR WITH RISING DOTS
+# 2A6C; SIMILAR MINUS SIMILAR
+# 2A6D; CONGRUENT WITH DOT ABOVE
+# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
+# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
+# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
+# 2A74; DOUBLE COLON EQUAL
+# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
+# 2ADC; FORKING
+# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
+# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
+# 2AF3; PARALLEL WITH TILDE OPERATOR
+# 2AFB; TRIPLE SOLIDUS BINARY RELATION
+# 2AFD; DOUBLE SOLIDUS OPERATOR
+# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
+# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
+# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
+# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
+# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
+
+# EOF
--- a/maint/Unicode.tables/CaseFolding.txt
+++ b/maint/Unicode.tables/CaseFolding.txt
@ -1,6 +1,6 @@
-# CaseFolding-13.0.0.txt
-# Date: 2019-09-08, 23:30:59 GMT
-# © 2019 Unicode®, Inc.
+# CaseFolding-14.0.0.txt
+# Date: 2021-03-08, 19:35:41 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
@ -1050,6 +1050,7 @@
 2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC
 2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A
 2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
+2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
 2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR
 2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE
 2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE
@ -1230,12 +1231,16 @@ A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE
 A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A
 A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I
 A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U
+A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O
 A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W
 A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK
 A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK
 A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK
 A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
 A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
+A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
+A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
+A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
 A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H
 AB70; C; 13A0; # CHEROKEE SMALL LETTER A
 AB71; C; 13A1; # CHEROKEE SMALL LETTER E
@ -1431,6 +1436,41 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
 104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA
 104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA
 104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA
+10570; C; 10597; # VITHKUQI CAPITAL LETTER A
+10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE
+10572; C; 10599; # VITHKUQI CAPITAL LETTER BE
+10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE
+10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE
+10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE
+10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE
+10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI
+10578; C; 1059F; # VITHKUQI CAPITAL LETTER E
+10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE
+1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA
+1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA
+1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA
+1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I
+1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE
+10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE
+10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA
+10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA
+10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA
+10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME
+10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE
+10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE
+10587; C; 105AE; # VITHKUQI CAPITAL LETTER O
+10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE
+10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA
+1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE
+1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE
+1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE
+1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE
+1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE
+10590; C; 105B7; # VITHKUQI CAPITAL LETTER U
+10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE
+10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE
+10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y
+10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE
 10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A
 10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA
 10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB
--- a/maint/Unicode.tables/DerivedBidiClass.txt
+++ b/maint/Unicode.tables/DerivedBidiClass.txt
--- a/maint/Unicode.tables/DerivedCoreProperties.txt
+++ b/maint/Unicode.tables/DerivedCoreProperties.txt
--- a/maint/Unicode.tables/DerivedGeneralCategory.txt
+++ b/maint/Unicode.tables/DerivedGeneralCategory.txt
@ -1,6 +1,6 @@
-# DerivedGeneralCategory-13.0.0.txt
-# Date: 2019-10-21, 14:30:32 GMT
-# © 2019 Unicode®, Inc.
+# DerivedGeneralCategory-14.0.0.txt
+# Date: 2021-07-10, 00:35:08 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
@ -27,7 +27,6 @@
 05C8..05CF    ; Cn #   [8] <reserved-05C8>..<reserved-05CF>
 05EB..05EE    ; Cn #   [4] <reserved-05EB>..<reserved-05EE>
 05F5..05FF    ; Cn #  [11] <reserved-05F5>..<reserved-05FF>
-061D          ; Cn #       <reserved-061D>
 070E          ; Cn #       <reserved-070E>
 074B..074C    ; Cn #   [2] <reserved-074B>..<reserved-074C>
 07B2..07BF    ; Cn #  [14] <reserved-07B2>..<reserved-07BF>
@ -36,9 +35,9 @@
 083F          ; Cn #       <reserved-083F>
 085C..085D    ; Cn #   [2] <reserved-085C>..<reserved-085D>
 085F          ; Cn #       <reserved-085F>
-086B..089F    ; Cn #  [53] <reserved-086B>..<reserved-089F>
-08B5          ; Cn #       <reserved-08B5>
-08C8..08D2    ; Cn #  [11] <reserved-08C8>..<reserved-08D2>
+086B..086F    ; Cn #   [5] <reserved-086B>..<reserved-086F>
+088F          ; Cn #       <reserved-088F>
+0892..0897    ; Cn #   [6] <reserved-0892>..<reserved-0897>
 0984          ; Cn #       <reserved-0984>
 098D..098E    ; Cn #   [2] <reserved-098D>..<reserved-098E>
 0991..0992    ; Cn #   [2] <reserved-0991>..<reserved-0992>
@ -116,12 +115,13 @@
 0C0D          ; Cn #       <reserved-0C0D>
 0C11          ; Cn #       <reserved-0C11>
 0C29          ; Cn #       <reserved-0C29>
-0C3A..0C3C    ; Cn #   [3] <reserved-0C3A>..<reserved-0C3C>
+0C3A..0C3B    ; Cn #   [2] <reserved-0C3A>..<reserved-0C3B>
 0C45          ; Cn #       <reserved-0C45>
 0C49          ; Cn #       <reserved-0C49>
 0C4E..0C54    ; Cn #   [7] <reserved-0C4E>..<reserved-0C54>
 0C57          ; Cn #       <reserved-0C57>
-0C5B..0C5F    ; Cn #   [5] <reserved-0C5B>..<reserved-0C5F>
+0C5B..0C5C    ; Cn #   [2] <reserved-0C5B>..<reserved-0C5C>
+0C5E..0C5F    ; Cn #   [2] <reserved-0C5E>..<reserved-0C5F>
 0C64..0C65    ; Cn #   [2] <reserved-0C64>..<reserved-0C65>
 0C70..0C76    ; Cn #   [7] <reserved-0C70>..<reserved-0C76>
 0C8D          ; Cn #       <reserved-0C8D>
@ -132,7 +132,7 @@
 0CC5          ; Cn #       <reserved-0CC5>
 0CC9          ; Cn #       <reserved-0CC9>
 0CCE..0CD4    ; Cn #   [7] <reserved-0CCE>..<reserved-0CD4>
-0CD7..0CDD    ; Cn #   [7] <reserved-0CD7>..<reserved-0CDD>
+0CD7..0CDC    ; Cn #   [6] <reserved-0CD7>..<reserved-0CDC>
 0CDF          ; Cn #       <reserved-0CDF>
 0CE4..0CE5    ; Cn #   [2] <reserved-0CE4>..<reserved-0CE5>
 0CF0          ; Cn #       <reserved-0CF0>
@ -200,8 +200,7 @@
 13FE..13FF    ; Cn #   [2] <reserved-13FE>..<reserved-13FF>
 169D..169F    ; Cn #   [3] <reserved-169D>..<reserved-169F>
 16F9..16FF    ; Cn #   [7] <reserved-16F9>..<reserved-16FF>
-170D          ; Cn #       <reserved-170D>
-1715..171F    ; Cn #  [11] <reserved-1715>..<reserved-171F>
+1716..171E    ; Cn #   [9] <reserved-1716>..<reserved-171E>
 1737..173F    ; Cn #   [9] <reserved-1737>..<reserved-173F>
 1754..175F    ; Cn #  [12] <reserved-1754>..<reserved-175F>
 176D          ; Cn #       <reserved-176D>
@ -210,7 +209,6 @@
 17DE..17DF    ; Cn #   [2] <reserved-17DE>..<reserved-17DF>
 17EA..17EF    ; Cn #   [6] <reserved-17EA>..<reserved-17EF>
 17FA..17FF    ; Cn #   [6] <reserved-17FA>..<reserved-17FF>
-180F          ; Cn #       <reserved-180F>
 181A..181F    ; Cn #   [6] <reserved-181A>..<reserved-181F>
 1879..187F    ; Cn #   [7] <reserved-1879>..<reserved-187F>
 18AB..18AF    ; Cn #   [5] <reserved-18AB>..<reserved-18AF>
@ -230,9 +228,9 @@
 1A8A..1A8F    ; Cn #   [6] <reserved-1A8A>..<reserved-1A8F>
 1A9A..1A9F    ; Cn #   [6] <reserved-1A9A>..<reserved-1A9F>
 1AAE..1AAF    ; Cn #   [2] <reserved-1AAE>..<reserved-1AAF>
-1AC1..1AFF    ; Cn #  [63] <reserved-1AC1>..<reserved-1AFF>
-1B4C..1B4F    ; Cn #   [4] <reserved-1B4C>..<reserved-1B4F>
-1B7D..1B7F    ; Cn #   [3] <reserved-1B7D>..<reserved-1B7F>
+1ACF..1AFF    ; Cn #  [49] <reserved-1ACF>..<reserved-1AFF>
+1B4D..1B4F    ; Cn #   [3] <reserved-1B4D>..<reserved-1B4F>
+1B7F          ; Cn #       <reserved-1B7F>
 1BF4..1BFB    ; Cn #   [8] <reserved-1BF4>..<reserved-1BFB>
 1C38..1C3A    ; Cn #   [3] <reserved-1C38>..<reserved-1C3A>
 1C4A..1C4C    ; Cn #   [3] <reserved-1C4A>..<reserved-1C4C>
@ -240,7 +238,6 @@
 1CBB..1CBC    ; Cn #   [2] <reserved-1CBB>..<reserved-1CBC>
 1CC8..1CCF    ; Cn #   [8] <reserved-1CC8>..<reserved-1CCF>
 1CFB..1CFF    ; Cn #   [5] <reserved-1CFB>..<reserved-1CFF>
-1DFA          ; Cn #       <reserved-1DFA>
 1F16..1F17    ; Cn #   [2] <reserved-1F16>..<reserved-1F17>
 1F1E..1F1F    ; Cn #   [2] <reserved-1F1E>..<reserved-1F1F>
 1F46..1F47    ; Cn #   [2] <reserved-1F46>..<reserved-1F47>
@ -261,15 +258,13 @@
 2072..2073    ; Cn #   [2] <reserved-2072>..<reserved-2073>
 208F          ; Cn #       <reserved-208F>
 209D..209F    ; Cn #   [3] <reserved-209D>..<reserved-209F>
-20C0..20CF    ; Cn #  [16] <reserved-20C0>..<reserved-20CF>
+20C1..20CF    ; Cn #  [15] <reserved-20C1>..<reserved-20CF>
 20F1..20FF    ; Cn #  [15] <reserved-20F1>..<reserved-20FF>
 218C..218F    ; Cn #   [4] <reserved-218C>..<reserved-218F>
 2427..243F    ; Cn #  [25] <reserved-2427>..<reserved-243F>
 244B..245F    ; Cn #  [21] <reserved-244B>..<reserved-245F>
 2B74..2B75    ; Cn #   [2] <reserved-2B74>..<reserved-2B75>
 2B96          ; Cn #       <reserved-2B96>
-2C2F          ; Cn #       <reserved-2C2F>
-2C5F          ; Cn #       <reserved-2C5F>
 2CF4..2CF8    ; Cn #   [5] <reserved-2CF4>..<reserved-2CF8>
 2D26          ; Cn #       <reserved-2D26>
 2D28..2D2C    ; Cn #   [5] <reserved-2D28>..<reserved-2D2C>
@ -285,7 +280,7 @@
 2DCF          ; Cn #       <reserved-2DCF>
 2DD7          ; Cn #       <reserved-2DD7>
 2DDF          ; Cn #       <reserved-2DDF>
-2E53..2E7F    ; Cn #  [45] <reserved-2E53>..<reserved-2E7F>
+2E5E..2E7F    ; Cn #  [34] <reserved-2E5E>..<reserved-2E7F>
 2E9A          ; Cn #       <reserved-2E9A>
 2EF4..2EFF    ; Cn #  [12] <reserved-2EF4>..<reserved-2EFF>
 2FD6..2FEF    ; Cn #  [26] <reserved-2FD6>..<reserved-2FEF>
@ -297,13 +292,14 @@
 318F          ; Cn #       <reserved-318F>
 31E4..31EF    ; Cn #  [12] <reserved-31E4>..<reserved-31EF>
 321F          ; Cn #       <reserved-321F>
-9FFD..9FFF    ; Cn #   [3] <reserved-9FFD>..<reserved-9FFF>
 A48D..A48F    ; Cn #   [3] <reserved-A48D>..<reserved-A48F>
 A4C7..A4CF    ; Cn #   [9] <reserved-A4C7>..<reserved-A4CF>
 A62C..A63F    ; Cn #  [20] <reserved-A62C>..<reserved-A63F>
 A6F8..A6FF    ; Cn #   [8] <reserved-A6F8>..<reserved-A6FF>
-A7C0..A7C1    ; Cn #   [2] <reserved-A7C0>..<reserved-A7C1>
-A7CB..A7F4    ; Cn #  [42] <reserved-A7CB>..<reserved-A7F4>
+A7CB..A7CF    ; Cn #   [5] <reserved-A7CB>..<reserved-A7CF>
+A7D2          ; Cn #       <reserved-A7D2>
+A7D4          ; Cn #       <reserved-A7D4>
+A7DA..A7F1    ; Cn #  [24] <reserved-A7DA>..<reserved-A7F1>
 A82D..A82F    ; Cn #   [3] <reserved-A82D>..<reserved-A82F>
 A83A..A83F    ; Cn #   [6] <reserved-A83A>..<reserved-A83F>
 A878..A87F    ; Cn #   [8] <reserved-A878>..<reserved-A87F>
@ -339,11 +335,10 @@ FB3D          ; Cn #       <reserved-FB3D>
 FB3F          ; Cn #       <reserved-FB3F>
 FB42          ; Cn #       <reserved-FB42>
 FB45          ; Cn #       <reserved-FB45>
-FBC2..FBD2    ; Cn #  [17] <reserved-FBC2>..<reserved-FBD2>
-FD40..FD4F    ; Cn #  [16] <reserved-FD40>..<reserved-FD4F>
+FBC3..FBD2    ; Cn #  [16] <reserved-FBC3>..<reserved-FBD2>
 FD90..FD91    ; Cn #   [2] <reserved-FD90>..<reserved-FD91>
-FDC8..FDEF    ; Cn #  [40] <reserved-FDC8>..<noncharacter-FDEF>
-FDFE..FDFF    ; Cn #   [2] <reserved-FDFE>..<reserved-FDFF>
+FDC8..FDCE    ; Cn #   [7] <reserved-FDC8>..<reserved-FDCE>
+FDD0..FDEF    ; Cn #  [32] <noncharacter-FDD0>..<noncharacter-FDEF>
 FE1A..FE1F    ; Cn #   [6] <reserved-FE1A>..<reserved-FE1F>
 FE53          ; Cn #       <reserved-FE53>
 FE67          ; Cn #       <reserved-FE67>
@ -387,10 +382,20 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 104FC..104FF  ; Cn #   [4] <reserved-104FC>..<reserved-104FF>
 10528..1052F  ; Cn #   [8] <reserved-10528>..<reserved-1052F>
 10564..1056E  ; Cn #  [11] <reserved-10564>..<reserved-1056E>
-10570..105FF  ; Cn # [144] <reserved-10570>..<reserved-105FF>
+1057B         ; Cn #       <reserved-1057B>
+1058B         ; Cn #       <reserved-1058B>
+10593         ; Cn #       <reserved-10593>
+10596         ; Cn #       <reserved-10596>
+105A2         ; Cn #       <reserved-105A2>
+105B2         ; Cn #       <reserved-105B2>
+105BA         ; Cn #       <reserved-105BA>
+105BD..105FF  ; Cn #  [67] <reserved-105BD>..<reserved-105FF>
 10737..1073F  ; Cn #   [9] <reserved-10737>..<reserved-1073F>
 10756..1075F  ; Cn #  [10] <reserved-10756>..<reserved-1075F>
-10768..107FF  ; Cn # [152] <reserved-10768>..<reserved-107FF>
+10768..1077F  ; Cn #  [24] <reserved-10768>..<reserved-1077F>
+10786         ; Cn #       <reserved-10786>
+107B1         ; Cn #       <reserved-107B1>
+107BB..107FF  ; Cn #  [69] <reserved-107BB>..<reserved-107FF>
 10806..10807  ; Cn #   [2] <reserved-10806>..<reserved-10807>
 10809         ; Cn #       <reserved-10809>
 10836         ; Cn #       <reserved-10836>
@ -433,12 +438,13 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 10EAE..10EAF  ; Cn #   [2] <reserved-10EAE>..<reserved-10EAF>
 10EB2..10EFF  ; Cn #  [78] <reserved-10EB2>..<reserved-10EFF>
 10F28..10F2F  ; Cn #   [8] <reserved-10F28>..<reserved-10F2F>
-10F5A..10FAF  ; Cn #  [86] <reserved-10F5A>..<reserved-10FAF>
+10F5A..10F6F  ; Cn #  [22] <reserved-10F5A>..<reserved-10F6F>
+10F8A..10FAF  ; Cn #  [38] <reserved-10F8A>..<reserved-10FAF>
 10FCC..10FDF  ; Cn #  [20] <reserved-10FCC>..<reserved-10FDF>
 10FF7..10FFF  ; Cn #   [9] <reserved-10FF7>..<reserved-10FFF>
 1104E..11051  ; Cn #   [4] <reserved-1104E>..<reserved-11051>
-11070..1107E  ; Cn #  [15] <reserved-11070>..<reserved-1107E>
-110C2..110CC  ; Cn #  [11] <reserved-110C2>..<reserved-110CC>
+11076..1107E  ; Cn #   [9] <reserved-11076>..<reserved-1107E>
+110C3..110CC  ; Cn #  [10] <reserved-110C3>..<reserved-110CC>
 110CE..110CF  ; Cn #   [2] <reserved-110CE>..<reserved-110CF>
 110E9..110EF  ; Cn #   [7] <reserved-110E9>..<reserved-110EF>
 110FA..110FF  ; Cn #   [6] <reserved-110FA>..<reserved-110FF>
@ -480,11 +486,11 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 11645..1164F  ; Cn #  [11] <reserved-11645>..<reserved-1164F>
 1165A..1165F  ; Cn #   [6] <reserved-1165A>..<reserved-1165F>
 1166D..1167F  ; Cn #  [19] <reserved-1166D>..<reserved-1167F>
-116B9..116BF  ; Cn #   [7] <reserved-116B9>..<reserved-116BF>
+116BA..116BF  ; Cn #   [6] <reserved-116BA>..<reserved-116BF>
 116CA..116FF  ; Cn #  [54] <reserved-116CA>..<reserved-116FF>
 1171B..1171C  ; Cn #   [2] <reserved-1171B>..<reserved-1171C>
 1172C..1172F  ; Cn #   [4] <reserved-1172C>..<reserved-1172F>
-11740..117FF  ; Cn # [192] <reserved-11740>..<reserved-117FF>
+11747..117FF  ; Cn # [185] <reserved-11747>..<reserved-117FF>
 1183C..1189F  ; Cn # [100] <reserved-1183C>..<reserved-1189F>
 118F3..118FE  ; Cn #  [12] <reserved-118F3>..<reserved-118FE>
 11907..11908  ; Cn #   [2] <reserved-11907>..<reserved-11908>
@ -499,7 +505,7 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 119D8..119D9  ; Cn #   [2] <reserved-119D8>..<reserved-119D9>
 119E5..119FF  ; Cn #  [27] <reserved-119E5>..<reserved-119FF>
 11A48..11A4F  ; Cn #   [8] <reserved-11A48>..<reserved-11A4F>
-11AA3..11ABF  ; Cn #  [29] <reserved-11AA3>..<reserved-11ABF>
+11AA3..11AAF  ; Cn #  [13] <reserved-11AA3>..<reserved-11AAF>
 11AF9..11BFF  ; Cn # [263] <reserved-11AF9>..<reserved-11BFF>
 11C09         ; Cn #       <reserved-11C09>
 11C37         ; Cn #       <reserved-11C37>
@ -527,14 +533,16 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 1239A..123FF  ; Cn # [102] <reserved-1239A>..<reserved-123FF>
 1246F         ; Cn #       <reserved-1246F>
 12475..1247F  ; Cn #  [11] <reserved-12475>..<reserved-1247F>
-12544..12FFF  ; Cn # [2748] <reserved-12544>..<reserved-12FFF>
+12544..12F8F  ; Cn # [2636] <reserved-12544>..<reserved-12F8F>
+12FF3..12FFF  ; Cn #  [13] <reserved-12FF3>..<reserved-12FFF>
 1342F         ; Cn #       <reserved-1342F>
 13439..143FF  ; Cn # [4039] <reserved-13439>..<reserved-143FF>
 14647..167FF  ; Cn # [8633] <reserved-14647>..<reserved-167FF>
 16A39..16A3F  ; Cn #   [7] <reserved-16A39>..<reserved-16A3F>
 16A5F         ; Cn #       <reserved-16A5F>
 16A6A..16A6D  ; Cn #   [4] <reserved-16A6A>..<reserved-16A6D>
-16A70..16ACF  ; Cn #  [96] <reserved-16A70>..<reserved-16ACF>
+16ABF         ; Cn #       <reserved-16ABF>
+16ACA..16ACF  ; Cn #   [6] <reserved-16ACA>..<reserved-16ACF>
 16AEE..16AEF  ; Cn #   [2] <reserved-16AEE>..<reserved-16AEF>
 16AF6..16AFF  ; Cn #  [10] <reserved-16AF6>..<reserved-16AFF>
 16B46..16B4F  ; Cn #  [10] <reserved-16B46>..<reserved-16B4F>
@ -550,8 +558,11 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 16FF2..16FFF  ; Cn #  [14] <reserved-16FF2>..<reserved-16FFF>
 187F8..187FF  ; Cn #   [8] <reserved-187F8>..<reserved-187FF>
 18CD6..18CFF  ; Cn #  [42] <reserved-18CD6>..<reserved-18CFF>
-18D09..1AFFF  ; Cn # [8951] <reserved-18D09>..<reserved-1AFFF>
-1B11F..1B14F  ; Cn #  [49] <reserved-1B11F>..<reserved-1B14F>
+18D09..1AFEF  ; Cn # [8935] <reserved-18D09>..<reserved-1AFEF>
+1AFF4         ; Cn #       <reserved-1AFF4>
+1AFFC         ; Cn #       <reserved-1AFFC>
+1AFFF         ; Cn #       <reserved-1AFFF>
+1B123..1B14F  ; Cn #  [45] <reserved-1B123>..<reserved-1B14F>
 1B153..1B163  ; Cn #  [17] <reserved-1B153>..<reserved-1B163>
 1B168..1B16F  ; Cn #   [8] <reserved-1B168>..<reserved-1B16F>
 1B2FC..1BBFF  ; Cn # [2308] <reserved-1B2FC>..<reserved-1BBFF>
@ -559,10 +570,13 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 1BC7D..1BC7F  ; Cn #   [3] <reserved-1BC7D>..<reserved-1BC7F>
 1BC89..1BC8F  ; Cn #   [7] <reserved-1BC89>..<reserved-1BC8F>
 1BC9A..1BC9B  ; Cn #   [2] <reserved-1BC9A>..<reserved-1BC9B>
-1BCA4..1CFFF  ; Cn # [4956] <reserved-1BCA4>..<reserved-1CFFF>
+1BCA4..1CEFF  ; Cn # [4700] <reserved-1BCA4>..<reserved-1CEFF>
+1CF2E..1CF2F  ; Cn #   [2] <reserved-1CF2E>..<reserved-1CF2F>
+1CF47..1CF4F  ; Cn #   [9] <reserved-1CF47>..<reserved-1CF4F>
+1CFC4..1CFFF  ; Cn #  [60] <reserved-1CFC4>..<reserved-1CFFF>
 1D0F6..1D0FF  ; Cn #  [10] <reserved-1D0F6>..<reserved-1D0FF>
 1D127..1D128  ; Cn #   [2] <reserved-1D127>..<reserved-1D128>
-1D1E9..1D1FF  ; Cn #  [23] <reserved-1D1E9>..<reserved-1D1FF>
+1D1EB..1D1FF  ; Cn #  [21] <reserved-1D1EB>..<reserved-1D1FF>
 1D246..1D2DF  ; Cn # [154] <reserved-1D246>..<reserved-1D2DF>
 1D2F4..1D2FF  ; Cn #  [12] <reserved-1D2F4>..<reserved-1D2FF>
 1D357..1D35F  ; Cn #   [9] <reserved-1D357>..<reserved-1D35F>
@ -589,7 +603,8 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 1D7CC..1D7CD  ; Cn #   [2] <reserved-1D7CC>..<reserved-1D7CD>
 1DA8C..1DA9A  ; Cn #  [15] <reserved-1DA8C>..<reserved-1DA9A>
 1DAA0         ; Cn #       <reserved-1DAA0>
-1DAB0..1DFFF  ; Cn # [1360] <reserved-1DAB0>..<reserved-1DFFF>
+1DAB0..1DEFF  ; Cn # [1104] <reserved-1DAB0>..<reserved-1DEFF>
+1DF1F..1DFFF  ; Cn # [225] <reserved-1DF1F>..<reserved-1DFFF>
 1E007         ; Cn #       <reserved-1E007>
 1E019..1E01A  ; Cn #   [2] <reserved-1E019>..<reserved-1E01A>
 1E022         ; Cn #       <reserved-1E022>
@ -598,9 +613,14 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 1E12D..1E12F  ; Cn #   [3] <reserved-1E12D>..<reserved-1E12F>
 1E13E..1E13F  ; Cn #   [2] <reserved-1E13E>..<reserved-1E13F>
 1E14A..1E14D  ; Cn #   [4] <reserved-1E14A>..<reserved-1E14D>
-1E150..1E2BF  ; Cn # [368] <reserved-1E150>..<reserved-1E2BF>
+1E150..1E28F  ; Cn # [320] <reserved-1E150>..<reserved-1E28F>
+1E2AF..1E2BF  ; Cn #  [17] <reserved-1E2AF>..<reserved-1E2BF>
 1E2FA..1E2FE  ; Cn #   [5] <reserved-1E2FA>..<reserved-1E2FE>
-1E300..1E7FF  ; Cn # [1280] <reserved-1E300>..<reserved-1E7FF>
+1E300..1E7DF  ; Cn # [1248] <reserved-1E300>..<reserved-1E7DF>
+1E7E7         ; Cn #       <reserved-1E7E7>
+1E7EC         ; Cn #       <reserved-1E7EC>
+1E7EF         ; Cn #       <reserved-1E7EF>
+1E7FF         ; Cn #       <reserved-1E7FF>
 1E8C5..1E8C6  ; Cn #   [2] <reserved-1E8C5>..<reserved-1E8C6>
 1E8D7..1E8FF  ; Cn #  [41] <reserved-1E8D7>..<reserved-1E8FF>
 1E94C..1E94F  ; Cn #   [4] <reserved-1E94C>..<reserved-1E94F>
@ -654,34 +674,35 @@ FFFE..FFFF    ; Cn #   [2] <noncharacter-FFFE>..<noncharacter-FFFF>
 1F249..1F24F  ; Cn #   [7] <reserved-1F249>..<reserved-1F24F>
 1F252..1F25F  ; Cn #  [14] <reserved-1F252>..<reserved-1F25F>
 1F266..1F2FF  ; Cn # [154] <reserved-1F266>..<reserved-1F2FF>
-1F6D8..1F6DF  ; Cn #   [8] <reserved-1F6D8>..<reserved-1F6DF>
+1F6D8..1F6DC  ; Cn #   [5] <reserved-1F6D8>..<reserved-1F6DC>
 1F6ED..1F6EF  ; Cn #   [3] <reserved-1F6ED>..<reserved-1F6EF>
 1F6FD..1F6FF  ; Cn #   [3] <reserved-1F6FD>..<reserved-1F6FF>
 1F774..1F77F  ; Cn #  [12] <reserved-1F774>..<reserved-1F77F>
 1F7D9..1F7DF  ; Cn #   [7] <reserved-1F7D9>..<reserved-1F7DF>
-1F7EC..1F7FF  ; Cn #  [20] <reserved-1F7EC>..<reserved-1F7FF>
+1F7EC..1F7EF  ; Cn #   [4] <reserved-1F7EC>..<reserved-1F7EF>
+1F7F1..1F7FF  ; Cn #  [15] <reserved-1F7F1>..<reserved-1F7FF>
 1F80C..1F80F  ; Cn #   [4] <reserved-1F80C>..<reserved-1F80F>
 1F848..1F84F  ; Cn #   [8] <reserved-1F848>..<reserved-1F84F>
 1F85A..1F85F  ; Cn #   [6] <reserved-1F85A>..<reserved-1F85F>
 1F888..1F88F  ; Cn #   [8] <reserved-1F888>..<reserved-1F88F>
 1F8AE..1F8AF  ; Cn #   [2] <reserved-1F8AE>..<reserved-1F8AF>
 1F8B2..1F8FF  ; Cn #  [78] <reserved-1F8B2>..<reserved-1F8FF>
-1F979         ; Cn #       <reserved-1F979>
-1F9CC         ; Cn #       <reserved-1F9CC>
 1FA54..1FA5F  ; Cn #  [12] <reserved-1FA54>..<reserved-1FA5F>
 1FA6E..1FA6F  ; Cn #   [2] <reserved-1FA6E>..<reserved-1FA6F>
 1FA75..1FA77  ; Cn #   [3] <reserved-1FA75>..<reserved-1FA77>
-1FA7B..1FA7F  ; Cn #   [5] <reserved-1FA7B>..<reserved-1FA7F>
+1FA7D..1FA7F  ; Cn #   [3] <reserved-1FA7D>..<reserved-1FA7F>
 1FA87..1FA8F  ; Cn #   [9] <reserved-1FA87>..<reserved-1FA8F>
-1FAA9..1FAAF  ; Cn #   [7] <reserved-1FAA9>..<reserved-1FAAF>
-1FAB7..1FABF  ; Cn #   [9] <reserved-1FAB7>..<reserved-1FABF>
-1FAC3..1FACF  ; Cn #  [13] <reserved-1FAC3>..<reserved-1FACF>
-1FAD7..1FAFF  ; Cn #  [41] <reserved-1FAD7>..<reserved-1FAFF>
+1FAAD..1FAAF  ; Cn #   [3] <reserved-1FAAD>..<reserved-1FAAF>
+1FABB..1FABF  ; Cn #   [5] <reserved-1FABB>..<reserved-1FABF>
+1FAC6..1FACF  ; Cn #  [10] <reserved-1FAC6>..<reserved-1FACF>
+1FADA..1FADF  ; Cn #   [6] <reserved-1FADA>..<reserved-1FADF>
+1FAE8..1FAEF  ; Cn #   [8] <reserved-1FAE8>..<reserved-1FAEF>
+1FAF7..1FAFF  ; Cn #   [9] <reserved-1FAF7>..<reserved-1FAFF>
 1FB93         ; Cn #       <reserved-1FB93>
 1FBCB..1FBEF  ; Cn #  [37] <reserved-1FBCB>..<reserved-1FBEF>
 1FBFA..1FFFF  ; Cn # [1030] <reserved-1FBFA>..<noncharacter-1FFFF>
-2A6DE..2A6FF  ; Cn #  [34] <reserved-2A6DE>..<reserved-2A6FF>
-2B735..2B73F  ; Cn #  [11] <reserved-2B735>..<reserved-2B73F>
+2A6E0..2A6FF  ; Cn #  [32] <reserved-2A6E0>..<reserved-2A6FF>
+2B739..2B73F  ; Cn #   [7] <reserved-2B739>..<reserved-2B73F>
 2B81E..2B81F  ; Cn #   [2] <reserved-2B81E>..<reserved-2B81F>
 2CEA2..2CEAF  ; Cn #  [14] <reserved-2CEA2>..<reserved-2CEAF>
 2EBE1..2F7FF  ; Cn # [3103] <reserved-2EBE1>..<reserved-2F7FF>
@ -693,7 +714,7 @@ E01F0..EFFFF  ; Cn # [65040] <reserved-E01F0>..<noncharacter-EFFFF>
 FFFFE..FFFFF  ; Cn #   [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
 10FFFE..10FFFF; Cn #   [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>

-# Total code points: 830672
+# Total code points: 829834

 # ================================================

@ -1130,7 +1151,7 @@ FFFFE..FFFFF  ; Cn #   [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
 213E..213F    ; Lu #   [2] DOUBLE-STRUCK CAPITAL GAMMA..DOUBLE-STRUCK CAPITAL PI
 2145          ; Lu #       DOUBLE-STRUCK ITALIC CAPITAL D
 2183          ; Lu #       ROMAN NUMERAL REVERSED ONE HUNDRED
-2C00..2C2E    ; Lu #  [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
+2C00..2C2F    ; Lu #  [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
 2C60          ; Lu #       LATIN CAPITAL LETTER L WITH DOUBLE BAR
 2C62..2C64    ; Lu #   [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL
 2C67          ; Lu #       LATIN CAPITAL LETTER H WITH DESCENDER
@ -1295,13 +1316,21 @@ A7B8          ; Lu #       LATIN CAPITAL LETTER U WITH STROKE
 A7BA          ; Lu #       LATIN CAPITAL LETTER GLOTTAL A
 A7BC          ; Lu #       LATIN CAPITAL LETTER GLOTTAL I
 A7BE          ; Lu #       LATIN CAPITAL LETTER GLOTTAL U
+A7C0          ; Lu #       LATIN CAPITAL LETTER OLD POLISH O
 A7C2          ; Lu #       LATIN CAPITAL LETTER ANGLICANA W
 A7C4..A7C7    ; Lu #   [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
 A7C9          ; Lu #       LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
+A7D0          ; Lu #       LATIN CAPITAL LETTER CLOSED INSULAR G
+A7D6          ; Lu #       LATIN CAPITAL LETTER MIDDLE SCOTS S
+A7D8          ; Lu #       LATIN CAPITAL LETTER SIGMOID S
 A7F5          ; Lu #       LATIN CAPITAL LETTER REVERSED HALF H
 FF21..FF3A    ; Lu #  [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
 10400..10427  ; Lu #  [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW
 104B0..104D3  ; Lu #  [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
+10570..1057A  ; Lu #  [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
+1057C..1058A  ; Lu #  [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
+1058C..10592  ; Lu #   [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
+10594..10595  ; Lu #   [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
 10C80..10CB2  ; Lu #  [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US
 118A0..118BF  ; Lu #  [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO
 16E40..16E5F  ; Lu #  [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y
@ -1338,7 +1367,7 @@ FF21..FF3A    ; Lu #  [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP
 1D7CA         ; Lu #       MATHEMATICAL BOLD CAPITAL DIGAMMA
 1E900..1E921  ; Lu #  [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA

-# Total code points: 1791
+# Total code points: 1831

 # ================================================

@ -1775,7 +1804,7 @@ FF21..FF3A    ; Lu #  [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP
 2146..2149    ; Ll #   [4] DOUBLE-STRUCK ITALIC SMALL D..DOUBLE-STRUCK ITALIC SMALL J
 214E          ; Ll #       TURNED SMALL F
 2184          ; Ll #       LATIN SMALL LETTER REVERSED C
-2C30..2C5E    ; Ll #  [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE
+2C30..2C5F    ; Ll #  [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
 2C61          ; Ll #       LATIN SMALL LETTER L WITH DOUBLE BAR
 2C65..2C66    ; Ll #   [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE
 2C68          ; Ll #       LATIN SMALL LETTER H WITH DESCENDER
@ -1944,9 +1973,15 @@ A7B9          ; Ll #       LATIN SMALL LETTER U WITH STROKE
 A7BB          ; Ll #       LATIN SMALL LETTER GLOTTAL A
 A7BD          ; Ll #       LATIN SMALL LETTER GLOTTAL I
 A7BF          ; Ll #       LATIN SMALL LETTER GLOTTAL U
+A7C1          ; Ll #       LATIN SMALL LETTER OLD POLISH O
 A7C3          ; Ll #       LATIN SMALL LETTER ANGLICANA W
 A7C8          ; Ll #       LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY
 A7CA          ; Ll #       LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
+A7D1          ; Ll #       LATIN SMALL LETTER CLOSED INSULAR G
+A7D3          ; Ll #       LATIN SMALL LETTER DOUBLE THORN
+A7D5          ; Ll #       LATIN SMALL LETTER DOUBLE WYNN
+A7D7          ; Ll #       LATIN SMALL LETTER MIDDLE SCOTS S
+A7D9          ; Ll #       LATIN SMALL LETTER SIGMOID S
 A7F6          ; Ll #       LATIN SMALL LETTER REVERSED HALF H
 A7FA          ; Ll #       LATIN LETTER SMALL CAPITAL TURNED M
 AB30..AB5A    ; Ll #  [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
@ -1957,6 +1992,10 @@ FB13..FB17    ; Ll #   [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGAT
 FF41..FF5A    ; Ll #  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
 10428..1044F  ; Ll #  [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW
 104D8..104FB  ; Ll #  [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
+10597..105A1  ; Ll #  [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
+105A3..105B1  ; Ll #  [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
+105B3..105B9  ; Ll #   [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
+105BB..105BC  ; Ll #   [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
 10CC0..10CF2  ; Ll #  [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US
 118C0..118DF  ; Ll #  [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
 16E60..16E7F  ; Ll #  [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y
@ -1988,9 +2027,11 @@ FF41..FF5A    ; Ll #  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL
 1D7AA..1D7C2  ; Ll #  [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
 1D7C4..1D7C9  ; Ll #   [6] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
 1D7CB         ; Ll #       MATHEMATICAL BOLD SMALL DIGAMMA
+1DF00..1DF09  ; Ll #  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
+1DF0B..1DF1E  ; Ll #  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
 1E922..1E943  ; Ll #  [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA

-# Total code points: 2155
+# Total code points: 2227

 # ================================================

@ -2028,6 +2069,7 @@ FF41..FF5A    ; Ll #  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL
 081A          ; Lm #       SAMARITAN MODIFIER LETTER EPENTHETIC YUT
 0824          ; Lm #       SAMARITAN MODIFIER LETTER SHORT A
 0828          ; Lm #       SAMARITAN MODIFIER LETTER I
+08C9          ; Lm #       ARABIC SMALL FARSI YEH
 0971          ; Lm #       DEVANAGARI SIGN HIGH SPACING DOT
 0E46          ; Lm #       THAI CHARACTER MAIYAMOK
 0EC6          ; Lm #       LAO KO LA
@ -2058,6 +2100,7 @@ A69C..A69D    ; Lm #   [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER C
 A717..A71F    ; Lm #   [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
 A770          ; Lm #       MODIFIER LETTER US
 A788          ; Lm #       MODIFIER LETTER LOW CIRCUMFLEX ACCENT
+A7F2..A7F4    ; Lm #   [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
 A7F8..A7F9    ; Lm #   [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
 A9CF          ; Lm #       JAVANESE PANGRANGKEP
 A9E6          ; Lm #       MYANMAR MODIFIER LETTER SHAN REDUPLICATION
@ -2068,14 +2111,20 @@ AB5C..AB5F    ; Lm #   [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U W
 AB69          ; Lm #       MODIFIER LETTER SMALL TURNED W
 FF70          ; Lm #       HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
 FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+10780..10785  ; Lm #   [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
+10787..107B0  ; Lm #  [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
+107B2..107BA  ; Lm #   [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
 16B40..16B43  ; Lm #   [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
 16F93..16F9F  ; Lm #  [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
 16FE0..16FE1  ; Lm #   [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
 16FE3         ; Lm #       OLD CHINESE ITERATION MARK
+1AFF0..1AFF3  ; Lm #   [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
+1AFF5..1AFFB  ; Lm #   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
+1AFFD..1AFFE  ; Lm #   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
 1E137..1E13D  ; Lm #   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
 1E94B         ; Lm #       ADLAM NASALIZATION MARK

-# Total code points: 260
+# Total code points: 334

 # ================================================

@ -2104,8 +2153,9 @@ FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
 0800..0815    ; Lo #  [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
 0840..0858    ; Lo #  [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
 0860..086A    ; Lo #  [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
-08A0..08B4    ; Lo #  [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW
-08B6..08C7    ; Lo #  [18] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
+0870..0887    ; Lo #  [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
+0889..088E    ; Lo #   [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
+08A0..08C8    ; Lo #  [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
 0904..0939    ; Lo #  [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
 093D          ; Lo #       DEVANAGARI SIGN AVAGRAHA
 0950          ; Lo #       DEVANAGARI OM
@ -2170,6 +2220,7 @@ FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
 0C2A..0C39    ; Lo #  [16] TELUGU LETTER PA..TELUGU LETTER HA
 0C3D          ; Lo #       TELUGU SIGN AVAGRAHA
 0C58..0C5A    ; Lo #   [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
+0C5D          ; Lo #       TELUGU LETTER NAKAARA POLLU
 0C60..0C61    ; Lo #   [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
 0C80          ; Lo #       KANNADA SIGN SPACING CANDRABINDU
 0C85..0C8C    ; Lo #   [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
@ -2178,7 +2229,7 @@ FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
 0CAA..0CB3    ; Lo #  [10] KANNADA LETTER PA..KANNADA LETTER LLA
 0CB5..0CB9    ; Lo #   [5] KANNADA LETTER VA..KANNADA LETTER HA
 0CBD          ; Lo #       KANNADA SIGN AVAGRAHA
-0CDE          ; Lo #       KANNADA LETTER FA
+0CDD..0CDE    ; Lo #   [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
 0CE0..0CE1    ; Lo #   [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
 0CF1..0CF2    ; Lo #   [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
 0D04..0D0C    ; Lo #   [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@ -2242,9 +2293,8 @@ FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
 1681..169A    ; Lo #  [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
 16A0..16EA    ; Lo #  [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
 16F1..16F8    ; Lo #   [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
-1700..170C    ; Lo #  [13] TAGALOG LETTER A..TAGALOG LETTER YA
-170E..1711    ; Lo #   [4] TAGALOG LETTER LA..TAGALOG LETTER HA
-1720..1731    ; Lo #  [18] HANUNOO LETTER A..HANUNOO LETTER HA
+1700..1711    ; Lo #  [18] TAGALOG LETTER A..TAGALOG LETTER HA
+171F..1731    ; Lo #  [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA
 1740..1751    ; Lo #  [18] BUHID LETTER A..BUHID LETTER HA
 1760..176C    ; Lo #  [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
 176E..1770    ; Lo #   [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
@ -2264,7 +2314,7 @@ FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
 1A00..1A16    ; Lo #  [23] BUGINESE LETTER KA..BUGINESE LETTER HA
 1A20..1A54    ; Lo #  [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA
 1B05..1B33    ; Lo #  [47] BALINESE LETTER AKARA..BALINESE LETTER HA
-1B45..1B4B    ; Lo #   [7] BALINESE LETTER KAF SASAK..BALINESE LETTER ASYURA SASAK
+1B45..1B4C    ; Lo #   [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
 1B83..1BA0    ; Lo #  [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
 1BAE..1BAF    ; Lo #   [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
 1BBA..1BE5    ; Lo #  [44] SUNDANESE AVAGRAHA..BATAK LETTER U
@ -2297,8 +2347,7 @@ FF9E..FF9F    ; Lm #   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
 31A0..31BF    ; Lo #  [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
 31F0..31FF    ; Lo #  [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
 3400..4DBF    ; Lo # [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
-4E00..9FFC    ; Lo # [20989] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFC
-A000..A014    ; Lo #  [21] YI SYLLABLE IT..YI SYLLABLE E
+4E00..A014    ; Lo # [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E
 A016..A48C    ; Lo # [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
 A4D0..A4F7    ; Lo #  [40] LISU LETTER BA..LISU LETTER OE
 A500..A60B    ; Lo # [268] VAI SYLLABLE EE..VAI SYLLABLE NG
@ -2426,9 +2475,12 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 10F00..10F1C  ; Lo #  [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
 10F27         ; Lo #       OLD SOGDIAN LIGATURE AYIN-DALETH
 10F30..10F45  ; Lo #  [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
+10F70..10F81  ; Lo #  [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
 10FB0..10FC4  ; Lo #  [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW
 10FE0..10FF6  ; Lo #  [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH
 11003..11037  ; Lo #  [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
+11071..11072  ; Lo #   [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
+11075         ; Lo #       BRAHMI LETTER OLD TAMIL LLA
 11083..110AF  ; Lo #  [45] KAITHI LETTER A..KAITHI LETTER HA
 110D0..110E8  ; Lo #  [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE
 11103..11126  ; Lo #  [36] CHAKMA LETTER AA..CHAKMA LETTER HAA
@ -2470,6 +2522,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 11680..116AA  ; Lo #  [43] TAKRI LETTER A..TAKRI LETTER RRA
 116B8         ; Lo #       TAKRI LETTER ARCHAIC KHA
 11700..1171A  ; Lo #  [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA
+11740..11746  ; Lo #   [7] AHOM LETTER CA..AHOM LETTER LLA
 11800..1182B  ; Lo #  [44] DOGRA LETTER A..DOGRA LETTER RRA
 118FF..11906  ; Lo #   [8] WARANG CITI OM..DIVES AKURU LETTER E
 11909         ; Lo #       DIVES AKURU LETTER O
@ -2488,7 +2541,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 11A50         ; Lo #       SOYOMBO LETTER A
 11A5C..11A89  ; Lo #  [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA
 11A9D         ; Lo #       SOYOMBO MARK PLUTA
-11AC0..11AF8  ; Lo #  [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+11AB0..11AF8  ; Lo #  [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL
 11C00..11C08  ; Lo #   [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
 11C0A..11C2E  ; Lo #  [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
 11C40         ; Lo #       BHAIKSUKI SIGN AVAGRAHA
@ -2505,10 +2558,12 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 11FB0         ; Lo #       LISU LETTER YHA
 12000..12399  ; Lo # [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
 12480..12543  ; Lo # [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
+12F90..12FF0  ; Lo #  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
 13000..1342E  ; Lo # [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
 14400..14646  ; Lo # [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
 16800..16A38  ; Lo # [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
 16A40..16A5E  ; Lo #  [31] MRO LETTER TA..MRO LETTER TEK
+16A70..16ABE  ; Lo #  [79] TANGSA LETTER OZ..TANGSA LETTER ZA
 16AD0..16AED  ; Lo #  [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
 16B00..16B2F  ; Lo #  [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
 16B63..16B77  ; Lo #  [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
@ -2518,7 +2573,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 17000..187F7  ; Lo # [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
 18800..18CD5  ; Lo # [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5
 18D00..18D08  ; Lo #   [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
-1B000..1B11E  ; Lo # [287] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER N-MU-MO-2
+1B000..1B122  ; Lo # [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
 1B150..1B152  ; Lo #   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
 1B164..1B167  ; Lo #   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
 1B170..1B2FB  ; Lo # [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
@ -2526,9 +2581,15 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 1BC70..1BC7C  ; Lo #  [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
 1BC80..1BC88  ; Lo #   [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
 1BC90..1BC99  ; Lo #  [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
+1DF0A         ; Lo #       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
 1E100..1E12C  ; Lo #  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
 1E14E         ; Lo #       NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
+1E290..1E2AD  ; Lo #  [30] TOTO LETTER PA..TOTO LETTER A
 1E2C0..1E2EB  ; Lo #  [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E7E0..1E7E6  ; Lo #   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
+1E7E8..1E7EB  ; Lo #   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
+1E7ED..1E7EE  ; Lo #   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
+1E7F0..1E7FE  ; Lo #  [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
 1E800..1E8C4  ; Lo # [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
 1EE00..1EE03  ; Lo #   [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL
 1EE05..1EE1F  ; Lo #  [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF
@ -2563,15 +2624,15 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 1EEA1..1EEA3  ; Lo #   [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
 1EEA5..1EEA9  ; Lo #   [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
 1EEAB..1EEBB  ; Lo #  [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
-20000..2A6DD  ; Lo # [42718] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DD
-2A700..2B734  ; Lo # [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
+20000..2A6DF  ; Lo # [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
+2A700..2B738  ; Lo # [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
 2B740..2B81D  ; Lo # [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
 2B820..2CEA1  ; Lo # [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
 2CEB0..2EBE0  ; Lo # [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
 2F800..2FA1D  ; Lo # [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
 30000..3134A  ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A

-# Total code points: 127004
+# Total code points: 127333

 # ================================================

@ -2601,7 +2662,8 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 0825..0827    ; Mn #   [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
 0829..082D    ; Mn #   [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
 0859..085B    ; Mn #   [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-08D3..08E1    ; Mn #  [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
+0898..089F    ; Mn #   [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+08CA..08E1    ; Mn #  [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
 08E3..0902    ; Mn #  [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
 093A          ; Mn #       DEVANAGARI VOWEL SIGN OE
 093C          ; Mn #       DEVANAGARI SIGN NUKTA
@ -2642,6 +2704,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 0BCD          ; Mn #       TAMIL SIGN VIRAMA
 0C00          ; Mn #       TELUGU SIGN COMBINING CANDRABINDU ABOVE
 0C04          ; Mn #       TELUGU SIGN COMBINING ANUSVARA ABOVE
+0C3C          ; Mn #       TELUGU SIGN NUKTA
 0C3E..0C40    ; Mn #   [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
 0C46..0C48    ; Mn #   [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
 0C4A..0C4D    ; Mn #   [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
@ -2691,7 +2754,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 109D          ; Mn #       MYANMAR VOWEL SIGN AITON AI
 135D..135F    ; Mn #   [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
 1712..1714    ; Mn #   [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
-1732..1734    ; Mn #   [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
+1732..1733    ; Mn #   [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
 1752..1753    ; Mn #   [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
 1772..1773    ; Mn #   [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
 17B4..17B5    ; Mn #   [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
@ -2700,6 +2763,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 17C9..17D3    ; Mn #  [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
 17DD          ; Mn #       KHMER SIGN ATTHACAN
 180B..180D    ; Mn #   [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
+180F          ; Mn #       MONGOLIAN FREE VARIATION SELECTOR FOUR
 1885..1886    ; Mn #   [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
 18A9          ; Mn #       MONGOLIAN LETTER ALI GALI DAGALGA
 1920..1922    ; Mn #   [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
@ -2716,7 +2780,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 1A73..1A7C    ; Mn #  [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
 1A7F          ; Mn #       TAI THAM COMBINING CRYPTOGRAMMIC DOT
 1AB0..1ABD    ; Mn #  [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
-1ABF..1AC0    ; Mn #   [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
+1ABF..1ACE    ; Mn #  [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
 1B00..1B03    ; Mn #   [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
 1B34          ; Mn #       BALINESE SIGN REREKAN
 1B36..1B3A    ; Mn #   [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
@ -2739,8 +2803,7 @@ FFDA..FFDC    ; Lo #   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
 1CED          ; Mn #       VEDIC SIGN TIRYAK
 1CF4          ; Mn #       VEDIC TONE CANDRA ABOVE
 1CF8..1CF9    ; Mn #   [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
-1DC0..1DF9    ; Mn #  [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
-1DFB..1DFF    ; Mn #   [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
+1DC0..1DFF    ; Mn #  [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
 20D0..20DC    ; Mn #  [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
 20E1          ; Mn #       COMBINING LEFT RIGHT ARROW ABOVE
 20E5..20F0    ; Mn #  [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
@ -2799,11 +2862,15 @@ FE20..FE2F    ; Mn #  [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
 10D24..10D27  ; Mn #   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
 10EAB..10EAC  ; Mn #   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
 10F46..10F50  ; Mn #  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
+10F82..10F85  ; Mn #   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
 11001         ; Mn #       BRAHMI SIGN ANUSVARA
 11038..11046  ; Mn #  [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
+11070         ; Mn #       BRAHMI SIGN OLD TAMIL VIRAMA
+11073..11074  ; Mn #   [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
 1107F..11081  ; Mn #   [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
 110B3..110B6  ; Mn #   [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
 110B9..110BA  ; Mn #   [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
+110C2         ; Mn #       KAITHI VOWEL SIGN VOCALIC R
 11100..11102  ; Mn #   [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
 11127..1112B  ; Mn #   [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
 1112D..11134  ; Mn #   [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
@ -2883,6 +2950,8 @@ FE20..FE2F    ; Mn #  [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
 16F8F..16F92  ; Mn #   [4] MIAO TONE RIGHT..MIAO TONE BELOW
 16FE4         ; Mn #       KHITAN SMALL SCRIPT FILLER
 1BC9D..1BC9E  ; Mn #   [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1CF00..1CF2D  ; Mn #  [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
+1CF30..1CF46  ; Mn #  [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
 1D167..1D169  ; Mn #   [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
 1D17B..1D182  ; Mn #   [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
 1D185..1D18B  ; Mn #   [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
@ -2900,12 +2969,13 @@ FE20..FE2F    ; Mn #  [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
 1E023..1E024  ; Mn #   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
 1E026..1E02A  ; Mn #   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
 1E130..1E136  ; Mn #   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
+1E2AE         ; Mn #       TOTO SIGN RISING TONE
 1E2EC..1E2EF  ; Mn #   [4] WANCHO TONE TUP..WANCHO TONE KOINI
 1E8D0..1E8D6  ; Mn #   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
 1E944..1E94A  ; Mn #   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
 E0100..E01EF  ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256

-# Total code points: 1839
+# Total code points: 1950

 # ================================================

@ -2980,6 +3050,8 @@ A670..A672    ; Me #   [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRIL
 1087..108C    ; Mc #   [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3
 108F          ; Mc #       MYANMAR SIGN RUMAI PALAUNG TONE-5
 109A..109C    ; Mc #   [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
+1715          ; Mc #       TAGALOG SIGN PAMUDPOD
+1734          ; Mc #       HANUNOO SIGN PAMUDPOD
 17B6          ; Mc #       KHMER VOWEL SIGN AA
 17BE..17C5    ; Mc #   [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
 17C7..17C8    ; Mc #   [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
@ -3099,7 +3171,7 @@ ABEC          ; Mc #       MEETEI MAYEK LUM IYEK
 1D165..1D166  ; Mc #   [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
 1D16D..1D172  ; Mc #   [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5

-# Total code points: 443
+# Total code points: 445

 # ================================================

@ -3160,6 +3232,7 @@ FF10..FF19    ; Nd #  [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
 11D50..11D59  ; Nd #  [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
 11DA0..11DA9  ; Nd #  [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
 16A60..16A69  ; Nd #  [10] MRO DIGIT ZERO..MRO DIGIT NINE
+16AC0..16AC9  ; Nd #  [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
 16B50..16B59  ; Nd #  [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
 1D7CE..1D7FF  ; Nd #  [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
 1E140..1E149  ; Nd #  [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
@ -3167,7 +3240,7 @@ FF10..FF19    ; Nd #  [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
 1E950..1E959  ; Nd #  [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
 1FBF0..1FBF9  ; Nd #  [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE

-# Total code points: 650
+# Total code points: 660

 # ================================================

@ -3314,6 +3387,7 @@ A830..A835    ; No #   [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTIO
 061C          ; Cf #       ARABIC LETTER MARK
 06DD          ; Cf #       ARABIC END OF AYAH
 070F          ; Cf #       SYRIAC ABBREVIATION MARK
+0890..0891    ; Cf #   [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
 08E2          ; Cf #       ARABIC DISPUTED END OF AYAH
 180E          ; Cf #       MONGOLIAN VOWEL SEPARATOR
 200B..200F    ; Cf #   [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
@ -3330,7 +3404,7 @@ FFF9..FFFB    ; Cf #   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION
 E0001         ; Cf #       LANGUAGE TAG
 E0020..E007F  ; Cf #  [96] TAG SPACE..CANCEL TAG

-# Total code points: 161
+# Total code points: 163

 # ================================================

@ -3364,6 +3438,7 @@ D800..DFFF    ; Cs # [2048] <surrogate-D800>..<surrogate-DFFF>
 2E1A          ; Pd #       HYPHEN WITH DIAERESIS
 2E3A..2E3B    ; Pd #   [2] TWO-EM DASH..THREE-EM DASH
 2E40          ; Pd #       DOUBLE HYPHEN
+2E5D          ; Pd #       OBLIQUE HYPHEN
 301C          ; Pd #       WAVE DASH
 3030          ; Pd #       WAVY DASH
 30A0          ; Pd #       KATAKANA-HIRAGANA DOUBLE HYPHEN
@ -3373,7 +3448,7 @@ FE63          ; Pd #       SMALL HYPHEN-MINUS
 FF0D          ; Pd #       FULLWIDTH HYPHEN-MINUS
 10EAD         ; Pd #       YEZIDI HYPHENATION MARK

-# Total code points: 25
+# Total code points: 26

 # ================================================

@ -3425,6 +3500,10 @@ FF0D          ; Pd #       FULLWIDTH HYPHEN-MINUS
 2E26          ; Ps #       LEFT SIDEWAYS U BRACKET
 2E28          ; Ps #       LEFT DOUBLE PARENTHESIS
 2E42          ; Ps #       DOUBLE LOW-REVERSED-9 QUOTATION MARK
+2E55          ; Ps #       LEFT SQUARE BRACKET WITH STROKE
+2E57          ; Ps #       LEFT SQUARE BRACKET WITH DOUBLE STROKE
+2E59          ; Ps #       TOP HALF LEFT PARENTHESIS
+2E5B          ; Ps #       BOTTOM HALF LEFT PARENTHESIS
 3008          ; Ps #       LEFT ANGLE BRACKET
 300A          ; Ps #       LEFT DOUBLE ANGLE BRACKET
 300C          ; Ps #       LEFT CORNER BRACKET
@ -3455,7 +3534,7 @@ FF5B          ; Ps #       FULLWIDTH LEFT CURLY BRACKET
 FF5F          ; Ps #       FULLWIDTH LEFT WHITE PARENTHESIS
 FF62          ; Ps #       HALFWIDTH LEFT CORNER BRACKET

-# Total code points: 75
+# Total code points: 79

 # ================================================

@ -3504,6 +3583,10 @@ FF62          ; Ps #       HALFWIDTH LEFT CORNER BRACKET
 2E25          ; Pe #       BOTTOM RIGHT HALF BRACKET
 2E27          ; Pe #       RIGHT SIDEWAYS U BRACKET
 2E29          ; Pe #       RIGHT DOUBLE PARENTHESIS
+2E56          ; Pe #       RIGHT SQUARE BRACKET WITH STROKE
+2E58          ; Pe #       RIGHT SQUARE BRACKET WITH DOUBLE STROKE
+2E5A          ; Pe #       TOP HALF RIGHT PARENTHESIS
+2E5C          ; Pe #       BOTTOM HALF RIGHT PARENTHESIS
 3009          ; Pe #       RIGHT ANGLE BRACKET
 300B          ; Pe #       RIGHT DOUBLE ANGLE BRACKET
 300D          ; Pe #       RIGHT CORNER BRACKET
@ -3534,7 +3617,7 @@ FF5D          ; Pe #       FULLWIDTH RIGHT CURLY BRACKET
 FF60          ; Pe #       FULLWIDTH RIGHT WHITE PARENTHESIS
 FF63          ; Pe #       HALFWIDTH RIGHT CORNER BRACKET

-# Total code points: 73
+# Total code points: 77

 # ================================================

@ -3576,7 +3659,7 @@ FF3F          ; Pc #       FULLWIDTH LOW LINE
 0609..060A    ; Po #   [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN
 060C..060D    ; Po #   [2] ARABIC COMMA..ARABIC DATE SEPARATOR
 061B          ; Po #       ARABIC SEMICOLON
-061E..061F    ; Po #   [2] ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC QUESTION MARK
+061D..061F    ; Po #   [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK
 066A..066D    ; Po #   [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR
 06D4          ; Po #       ARABIC FULL STOP
 0700..070D    ; Po #  [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS
@ -3613,6 +3696,7 @@ FF3F          ; Pc #       FULLWIDTH LOW LINE
 1AA0..1AA6    ; Po #   [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA
 1AA8..1AAD    ; Po #   [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG
 1B5A..1B60    ; Po #   [7] BALINESE PANTI..BALINESE PAMENENG
+1B7D..1B7E    ; Po #   [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG
 1BFC..1BFF    ; Po #   [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
 1C3B..1C3F    ; Po #   [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK
 1C7E..1C7F    ; Po #   [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
@ -3641,7 +3725,7 @@ FF3F          ; Pc #       FULLWIDTH LOW LINE
 2E3C..2E3F    ; Po #   [4] STENOGRAPHIC FULL STOP..CAPITULUM
 2E41          ; Po #       REVERSED COMMA
 2E43..2E4F    ; Po #  [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER
-2E52          ; Po #       TIRONIAN SIGN CAPITAL ET
+2E52..2E54    ; Po #   [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK
 3001..3003    ; Po #   [3] IDEOGRAPHIC COMMA..DITTO MARK
 303D          ; Po #       PART ALTERNATION MARK
 30FB          ; Po #       KATAKANA MIDDLE DOT
@ -3695,6 +3779,7 @@ FF64..FF65    ; Po #   [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
 10B39..10B3F  ; Po #   [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION
 10B99..10B9C  ; Po #   [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
 10F55..10F59  ; Po #   [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
+10F86..10F89  ; Po #   [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS
 11047..1104D  ; Po #   [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
 110BB..110BC  ; Po #   [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN
 110BE..110C1  ; Po #   [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
@ -3713,6 +3798,7 @@ FF64..FF65    ; Po #   [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
 115C1..115D7  ; Po #  [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
 11641..11643  ; Po #   [3] MODI DANDA..MODI ABBREVIATION SIGN
 11660..1166C  ; Po #  [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
+116B9         ; Po #       TAKRI ABBREVIATION SIGN
 1173C..1173E  ; Po #   [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
 1183B         ; Po #       DOGRA ABBREVIATION SIGN
 11944..11946  ; Po #   [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK
@ -3725,6 +3811,7 @@ FF64..FF65    ; Po #   [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
 11EF7..11EF8  ; Po #   [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
 11FFF         ; Po #       TAMIL PUNCTUATION END OF TEXT
 12470..12474  ; Po #   [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
+12FF1..12FF2  ; Po #   [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
 16A6E..16A6F  ; Po #   [2] MRO DANDA..MRO DOUBLE DANDA
 16AF5         ; Po #       BASSA VAH FULL STOP
 16B37..16B3B  ; Po #   [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
@ -3735,7 +3822,7 @@ FF64..FF65    ; Po #   [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
 1DA87..1DA8B  ; Po #   [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS
 1E95E..1E95F  ; Po #   [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK

-# Total code points: 593
+# Total code points: 605

 # ================================================

@ -3823,7 +3910,7 @@ FFE9..FFEC    ; Sm #   [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW
 0BF9          ; Sc #       TAMIL RUPEE SIGN
 0E3F          ; Sc #       THAI CURRENCY SYMBOL BAHT
 17DB          ; Sc #       KHMER CURRENCY SYMBOL RIEL
-20A0..20BF    ; Sc #  [32] EURO-CURRENCY SIGN..BITCOIN SIGN
+20A0..20C0    ; Sc #  [33] EURO-CURRENCY SIGN..SOM SIGN
 A838          ; Sc #       NORTH INDIC RUPEE MARK
 FDFC          ; Sc #       RIAL SIGN
 FE69          ; Sc #       SMALL DOLLAR SIGN
@ -3834,7 +3921,7 @@ FFE5..FFE6    ; Sc #   [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
 1E2FF         ; Sc #       WANCHO NGUN SIGN
 1ECB0         ; Sc #       INDIC SIYAQ RUPEE MARK

-# Total code points: 62
+# Total code points: 63

 # ================================================

@ -3853,6 +3940,7 @@ FFE5..FFE6    ; Sc #   [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
 02EF..02FF    ; Sk #  [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW
 0375          ; Sk #       GREEK LOWER NUMERAL SIGN
 0384..0385    ; Sk #   [2] GREEK TONOS..GREEK DIALYTIKA TONOS
+0888          ; Sk #       ARABIC RAISED ROUND DOT
 1FBD          ; Sk #       GREEK KORONIS
 1FBF..1FC1    ; Sk #   [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
 1FCD..1FCF    ; Sk #   [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI
@ -3865,13 +3953,13 @@ A720..A721    ; Sk #   [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER
 A789..A78A    ; Sk #   [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
 AB5B          ; Sk #       MODIFIER BREVE WITH INVERTED BREVE
 AB6A..AB6B    ; Sk #   [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK
-FBB2..FBC1    ; Sk #  [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
+FBB2..FBC2    ; Sk #  [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE
 FF3E          ; Sk #       FULLWIDTH CIRCUMFLEX ACCENT
 FF40          ; Sk #       FULLWIDTH GRAVE ACCENT
 FFE3          ; Sk #       FULLWIDTH MACRON
 1F3FB..1F3FF  ; Sk #   [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6

-# Total code points: 123
+# Total code points: 125

 # ================================================

@ -3984,7 +4072,9 @@ A828..A82B    ; So #   [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-
 A836..A837    ; So #   [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
 A839          ; So #       NORTH INDIC QUANTITY MARK
 AA77..AA79    ; So #   [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
-FDFD          ; So #       ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+FD40..FD4F    ; So #  [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH
+FDCF          ; So #       ARABIC LIGATURE SALAAMUHU ALAYNAA
+FDFD..FDFF    ; So #   [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL
 FFE4          ; So #       FULLWIDTH BROKEN BAR
 FFE8          ; So #       HALFWIDTH FORMS LIGHT VERTICAL
 FFED..FFEE    ; So #   [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
@ -4003,13 +4093,14 @@ FFFC..FFFD    ; So #   [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
 16B3C..16B3F  ; So #   [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
 16B45         ; So #       PAHAWH HMONG SIGN CIM TSOV ROG
 1BC9C         ; So #       DUPLOYAN SIGN O WITH CROSS
+1CF50..1CFC3  ; So # [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK
 1D000..1D0F5  ; So # [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
 1D100..1D126  ; So #  [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
 1D129..1D164  ; So #  [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
 1D16A..1D16C  ; So #   [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3
 1D183..1D184  ; So #   [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN
 1D18C..1D1A9  ; So #  [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH
-1D1AE..1D1E8  ; So #  [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN
+1D1AE..1D1EA  ; So #  [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
 1D200..1D241  ; So #  [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
 1D245         ; So #       GREEK MUSICAL LEIMMA
 1D300..1D356  ; So #  [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
@ -4035,32 +4126,33 @@ FFFC..FFFD    ; So #   [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
 1F260..1F265  ; So #   [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
 1F300..1F3FA  ; So # [251] CYCLONE..AMPHORA
 1F400..1F6D7  ; So # [728] RAT..ELEVATOR
-1F6E0..1F6EC  ; So #  [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
+1F6DD..1F6EC  ; So #  [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
 1F6F0..1F6FC  ; So #  [13] SATELLITE..ROLLER SKATE
 1F700..1F773  ; So # [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
 1F780..1F7D8  ; So #  [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
 1F7E0..1F7EB  ; So #  [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
+1F7F0         ; So #       HEAVY EQUALS SIGN
 1F800..1F80B  ; So #  [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
 1F810..1F847  ; So #  [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
 1F850..1F859  ; So #  [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
 1F860..1F887  ; So #  [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
 1F890..1F8AD  ; So #  [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
 1F8B0..1F8B1  ; So #   [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
-1F900..1F978  ; So # [121] CIRCLED CROSS FORMEE WITH FOUR DOTS..DISGUISED FACE
-1F97A..1F9CB  ; So #  [82] FACE WITH PLEADING EYES..BUBBLE TEA
-1F9CD..1FA53  ; So # [135] STANDING PERSON..BLACK CHESS KNIGHT-BISHOP
+1F900..1FA53  ; So # [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP
 1FA60..1FA6D  ; So #  [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
 1FA70..1FA74  ; So #   [5] BALLET SHOES..THONG SANDAL
-1FA78..1FA7A  ; So #   [3] DROP OF BLOOD..STETHOSCOPE
+1FA78..1FA7C  ; So #   [5] DROP OF BLOOD..CRUTCH
 1FA80..1FA86  ; So #   [7] YO-YO..NESTING DOLLS
-1FA90..1FAA8  ; So #  [25] RINGED PLANET..ROCK
-1FAB0..1FAB6  ; So #   [7] FLY..FEATHER
-1FAC0..1FAC2  ; So #   [3] ANATOMICAL HEART..PEOPLE HUGGING
-1FAD0..1FAD6  ; So #   [7] BLUEBERRIES..TEAPOT
+1FA90..1FAAC  ; So #  [29] RINGED PLANET..HAMSA
+1FAB0..1FABA  ; So #  [11] FLY..NEST WITH EGGS
+1FAC0..1FAC5  ; So #   [6] ANATOMICAL HEART..PERSON WITH CROWN
+1FAD0..1FAD9  ; So #  [10] BLUEBERRIES..JAR
+1FAE0..1FAE7  ; So #   [8] MELTING FACE..BUBBLES
+1FAF0..1FAF6  ; So #   [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
 1FB00..1FB92  ; So # [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
 1FB94..1FBCA  ; So #  [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON

-# Total code points: 6431
+# Total code points: 6605

 # ================================================

--- a/maint/Unicode.tables/GraphemeBreakProperty.txt
+++ b/maint/Unicode.tables/GraphemeBreakProperty.txt
@ -1,6 +1,6 @@
-# GraphemeBreakProperty-13.0.0.txt
-# Date: 2019-10-21, 14:30:35 GMT
-# © 2019 Unicode®, Inc.
+# GraphemeBreakProperty-14.0.0.txt
+# Date: 2021-08-12, 23:13:02 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
@ -21,6 +21,7 @@
 0600..0605    ; Prepend # Cf   [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
 06DD          ; Prepend # Cf       ARABIC END OF AYAH
 070F          ; Prepend # Cf       SYRIAC ABBREVIATION MARK
+0890..0891    ; Prepend # Cf   [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
 08E2          ; Prepend # Cf       ARABIC DISPUTED END OF AYAH
 0D4E          ; Prepend # Lo       MALAYALAM LETTER DOT REPH
 110BD         ; Prepend # Cf       KAITHI NUMBER SIGN
@ -32,7 +33,7 @@
 11A84..11A89  ; Prepend # Lo   [6] SOYOMBO SIGN JIHVAMULIYA..SOYOMBO CLUSTER-INITIAL LETTER SA
 11D46         ; Prepend # Lo       MASARAM GONDI REPHA

-# Total code points: 24
+# Total code points: 26

 # ================================================

@ -104,7 +105,8 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
 0825..0827    ; Extend # Mn   [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
 0829..082D    ; Extend # Mn   [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
 0859..085B    ; Extend # Mn   [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-08D3..08E1    ; Extend # Mn  [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
+0898..089F    ; Extend # Mn   [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+08CA..08E1    ; Extend # Mn  [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
 08E3..0902    ; Extend # Mn  [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
 093A          ; Extend # Mn       DEVANAGARI VOWEL SIGN OE
 093C          ; Extend # Mn       DEVANAGARI SIGN NUKTA
@ -151,6 +153,7 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
 0BD7          ; Extend # Mc       TAMIL AU LENGTH MARK
 0C00          ; Extend # Mn       TELUGU SIGN COMBINING CANDRABINDU ABOVE
 0C04          ; Extend # Mn       TELUGU SIGN COMBINING ANUSVARA ABOVE
+0C3C          ; Extend # Mn       TELUGU SIGN NUKTA
 0C3E..0C40    ; Extend # Mn   [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
 0C46..0C48    ; Extend # Mn   [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
 0C4A..0C4D    ; Extend # Mn   [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
@ -206,7 +209,7 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
 109D          ; Extend # Mn       MYANMAR VOWEL SIGN AITON AI
 135D..135F    ; Extend # Mn   [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
 1712..1714    ; Extend # Mn   [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
-1732..1734    ; Extend # Mn   [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
+1732..1733    ; Extend # Mn   [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
 1752..1753    ; Extend # Mn   [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
 1772..1773    ; Extend # Mn   [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
 17B4..17B5    ; Extend # Mn   [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
@ -215,6 +218,7 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
 17C9..17D3    ; Extend # Mn  [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
 17DD          ; Extend # Mn       KHMER SIGN ATTHACAN
 180B..180D    ; Extend # Mn   [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
+180F          ; Extend # Mn       MONGOLIAN FREE VARIATION SELECTOR FOUR
 1885..1886    ; Extend # Mn   [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
 18A9          ; Extend # Mn       MONGOLIAN LETTER ALI GALI DAGALGA
 1920..1922    ; Extend # Mn   [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
@ -232,7 +236,7 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
 1A7F          ; Extend # Mn       TAI THAM COMBINING CRYPTOGRAMMIC DOT
 1AB0..1ABD    ; Extend # Mn  [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
 1ABE          ; Extend # Me       COMBINING PARENTHESES OVERLAY
-1ABF..1AC0    ; Extend # Mn   [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
+1ABF..1ACE    ; Extend # Mn  [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
 1B00..1B03    ; Extend # Mn   [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
 1B34          ; Extend # Mn       BALINESE SIGN REREKAN
 1B35          ; Extend # Mc       BALINESE VOWEL SIGN TEDUNG
@ -256,8 +260,7 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
 1CED          ; Extend # Mn       VEDIC SIGN TIRYAK
 1CF4          ; Extend # Mn       VEDIC TONE CANDRA ABOVE
 1CF8..1CF9    ; Extend # Mn   [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
-1DC0..1DF9    ; Extend # Mn  [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
-1DFB..1DFF    ; Extend # Mn   [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
+1DC0..1DFF    ; Extend # Mn  [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
 200C          ; Extend # Cf       ZERO WIDTH NON-JOINER
 20D0..20DC    ; Extend # Mn  [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
 20DD..20E0    ; Extend # Me   [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
@ -322,11 +325,15 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
 10D24..10D27  ; Extend # Mn   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
 10EAB..10EAC  ; Extend # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
 10F46..10F50  ; Extend # Mn  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
+10F82..10F85  ; Extend # Mn   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
 11001         ; Extend # Mn       BRAHMI SIGN ANUSVARA
 11038..11046  ; Extend # Mn  [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
+11070         ; Extend # Mn       BRAHMI SIGN OLD TAMIL VIRAMA
+11073..11074  ; Extend # Mn   [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
 1107F..11081  ; Extend # Mn   [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
 110B3..110B6  ; Extend # Mn   [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
 110B9..110BA  ; Extend # Mn   [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
+110C2         ; Extend # Mn       KAITHI VOWEL SIGN VOCALIC R
 11100..11102  ; Extend # Mn   [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
 11127..1112B  ; Extend # Mn   [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
 1112D..11134  ; Extend # Mn   [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
@ -412,6 +419,8 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
 16F8F..16F92  ; Extend # Mn   [4] MIAO TONE RIGHT..MIAO TONE BELOW
 16FE4         ; Extend # Mn       KHITAN SMALL SCRIPT FILLER
 1BC9D..1BC9E  ; Extend # Mn   [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1CF00..1CF2D  ; Extend # Mn  [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
+1CF30..1CF46  ; Extend # Mn  [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
 1D165         ; Extend # Mc       MUSICAL SYMBOL COMBINING STEM
 1D167..1D169  ; Extend # Mn   [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
 1D16E..1D172  ; Extend # Mc   [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
@ -431,6 +440,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
 1E023..1E024  ; Extend # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
 1E026..1E02A  ; Extend # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
 1E130..1E136  ; Extend # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
+1E2AE         ; Extend # Mn       TOTO SIGN RISING TONE
 1E2EC..1E2EF  ; Extend # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
 1E8D0..1E8D6  ; Extend # Mn   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
 1E944..1E94A  ; Extend # Mn   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
@ -438,7 +448,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
 E0020..E007F  ; Extend # Cf  [96] TAG SPACE..CANCEL TAG
 E0100..E01EF  ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256

-# Total code points: 1984
+# Total code points: 2095

 # ================================================

@ -495,6 +505,8 @@ E0100..E01EF  ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
 103B..103C    ; SpacingMark # Mc   [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA
 1056..1057    ; SpacingMark # Mc   [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR
 1084          ; SpacingMark # Mc       MYANMAR VOWEL SIGN SHAN E
+1715          ; SpacingMark # Mc       TAGALOG SIGN PAMUDPOD
+1734          ; SpacingMark # Mc       HANUNOO SIGN PAMUDPOD
 17B6          ; SpacingMark # Mc       KHMER VOWEL SIGN AA
 17BE..17C5    ; SpacingMark # Mc   [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
 17C7..17C8    ; SpacingMark # Mc   [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
@ -579,7 +591,6 @@ ABEC          ; SpacingMark # Mc       MEETEI MAYEK LUM IYEK
 116AC         ; SpacingMark # Mc       TAKRI SIGN VISARGA
 116AE..116AF  ; SpacingMark # Mc   [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
 116B6         ; SpacingMark # Mc       TAKRI SIGN VIRAMA
-11720..11721  ; SpacingMark # Mc   [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA
 11726         ; SpacingMark # Mc       AHOM VOWEL SIGN E
 1182C..1182E  ; SpacingMark # Mc   [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II
 11838         ; SpacingMark # Mc       DOGRA SIGN VISARGA
--- a/maint/Unicode.tables/PropList.txt
+++ b/maint/Unicode.tables/PropList.txt
--- a/maint/Unicode.tables/PropertyAliases.txt
+++ b/maint/Unicode.tables/PropertyAliases.txt
@ -0,0 +1,212 @@
+# PropertyAliases-14.0.0.txt
+# Date: 2021-03-08, 19:35:48 GMT
+# © 2021 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+#   For documentation, see http://www.unicode.org/reports/tr44/
+#
+# This file contains aliases for properties used in the UCD.
+# These names can be used for XML formats of UCD data, for regular-expression
+# property tests, and other programmatic textual descriptions of Unicode data.
+#
+# The names may be translated in appropriate environments, and additional
+# aliases may be useful.
+#
+# FORMAT
+#
+# Each line has two or more fields, separated by semicolons.
+#
+# First Field: The first field is the short name for the property.
+# It is typically an abbreviation, but in a number of cases it is simply
+# a duplicate of the "long name" in the second field.
+# For Unihan database tags, the short name is actually a longer string than
+# the tag specified in the second field.
+#
+# Second Field: The second field is the long name for the property,
+# typically the formal name used in documentation about the property.
+#
+# The above are the preferred aliases. Other aliases may be listed in additional fields.
+#
+# Loose matching should be applied to all property names and property values, with
+# the exception of String Property values. With loose matching of property names and
+# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
+# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
+#
+# NOTE: Property value names are NOT unique across properties. For example:
+#
+#   AL means Arabic Letter for the Bidi_Class property, and
+#   AL means Above_Left for the Combining_Class property, and
+#   AL means Alphabetic for the Line_Break property.
+#
+# In addition, some property names may be the same as some property value names.
+# For example:
+#
+#   sc means the Script property, and
+#   Sc means the General_Category property value Currency_Symbol (Sc)
+#
+# The combination of property value and property name is, however, unique.
+#
+# For more information, see UAX #44, Unicode Character Database, and
+# UTS #18, Unicode Regular Expressions.
+# ================================================
+
+
+# ================================================
+# Numeric Properties
+# ================================================
+cjkAccountingNumeric     ; kAccountingNumeric
+cjkOtherNumeric          ; kOtherNumeric
+cjkPrimaryNumeric        ; kPrimaryNumeric
+nv                       ; Numeric_Value
+
+# ================================================
+# String Properties
+# ================================================
+cf                       ; Case_Folding
+cjkCompatibilityVariant  ; kCompatibilityVariant
+dm                       ; Decomposition_Mapping
+FC_NFKC                  ; FC_NFKC_Closure
+lc                       ; Lowercase_Mapping
+NFKC_CF                  ; NFKC_Casefold
+scf                      ; Simple_Case_Folding         ; sfc
+slc                      ; Simple_Lowercase_Mapping
+stc                      ; Simple_Titlecase_Mapping
+suc                      ; Simple_Uppercase_Mapping
+tc                       ; Titlecase_Mapping
+uc                       ; Uppercase_Mapping
+
+# ================================================
+# Miscellaneous Properties
+# ================================================
+bmg                      ; Bidi_Mirroring_Glyph
+bpb                      ; Bidi_Paired_Bracket
+cjkIICore                ; kIICore
+cjkIRG_GSource           ; kIRG_GSource
+cjkIRG_HSource           ; kIRG_HSource
+cjkIRG_JSource           ; kIRG_JSource
+cjkIRG_KPSource          ; kIRG_KPSource
+cjkIRG_KSource           ; kIRG_KSource
+cjkIRG_MSource           ; kIRG_MSource
+cjkIRG_SSource           ; kIRG_SSource
+cjkIRG_TSource           ; kIRG_TSource
+cjkIRG_UKSource          ; kIRG_UKSource
+cjkIRG_USource           ; kIRG_USource
+cjkIRG_VSource           ; kIRG_VSource
+cjkRSUnicode             ; kRSUnicode                  ; Unicode_Radical_Stroke; URS
+EqUIdeo                  ; Equivalent_Unified_Ideograph
+isc                      ; ISO_Comment
+JSN                      ; Jamo_Short_Name
+na                       ; Name
+na1                      ; Unicode_1_Name
+Name_Alias               ; Name_Alias
+scx                      ; Script_Extensions
+
+# ================================================
+# Catalog Properties
+# ================================================
+age                      ; Age
+blk                      ; Block
+sc                       ; Script
+
+# ================================================
+# Enumerated Properties
+# ================================================
+bc                       ; Bidi_Class
+bpt                      ; Bidi_Paired_Bracket_Type
+ccc                      ; Canonical_Combining_Class
+dt                       ; Decomposition_Type
+ea                       ; East_Asian_Width
+gc                       ; General_Category
+GCB                      ; Grapheme_Cluster_Break
+hst                      ; Hangul_Syllable_Type
+InPC                     ; Indic_Positional_Category
+InSC                     ; Indic_Syllabic_Category
+jg                       ; Joining_Group
+jt                       ; Joining_Type
+lb                       ; Line_Break
+NFC_QC                   ; NFC_Quick_Check
+NFD_QC                   ; NFD_Quick_Check
+NFKC_QC                  ; NFKC_Quick_Check
+NFKD_QC                  ; NFKD_Quick_Check
+nt                       ; Numeric_Type
+SB                       ; Sentence_Break
+vo                       ; Vertical_Orientation
+WB                       ; Word_Break
+
+# ================================================
+# Binary Properties
+# ================================================
+AHex                     ; ASCII_Hex_Digit
+Alpha                    ; Alphabetic
+Bidi_C                   ; Bidi_Control
+Bidi_M                   ; Bidi_Mirrored
+Cased                    ; Cased
+CE                       ; Composition_Exclusion
+CI                       ; Case_Ignorable
+Comp_Ex                  ; Full_Composition_Exclusion
+CWCF                     ; Changes_When_Casefolded
+CWCM                     ; Changes_When_Casemapped
+CWKCF                    ; Changes_When_NFKC_Casefolded
+CWL                      ; Changes_When_Lowercased
+CWT                      ; Changes_When_Titlecased
+CWU                      ; Changes_When_Uppercased
+Dash                     ; Dash
+Dep                      ; Deprecated
+DI                       ; Default_Ignorable_Code_Point
+Dia                      ; Diacritic
+EBase                    ; Emoji_Modifier_Base
+EComp                    ; Emoji_Component
+EMod                     ; Emoji_Modifier
+Emoji                    ; Emoji
+EPres                    ; Emoji_Presentation
+Ext                      ; Extender
+ExtPict                  ; Extended_Pictographic
+Gr_Base                  ; Grapheme_Base
+Gr_Ext                   ; Grapheme_Extend
+Gr_Link                  ; Grapheme_Link
+Hex                      ; Hex_Digit
+Hyphen                   ; Hyphen
+IDC                      ; ID_Continue
+Ideo                     ; Ideographic
+IDS                      ; ID_Start
+IDSB                     ; IDS_Binary_Operator
+IDST                     ; IDS_Trinary_Operator
+Join_C                   ; Join_Control
+LOE                      ; Logical_Order_Exception
+Lower                    ; Lowercase
+Math                     ; Math
+NChar                    ; Noncharacter_Code_Point
+OAlpha                   ; Other_Alphabetic
+ODI                      ; Other_Default_Ignorable_Code_Point
+OGr_Ext                  ; Other_Grapheme_Extend
+OIDC                     ; Other_ID_Continue
+OIDS                     ; Other_ID_Start
+OLower                   ; Other_Lowercase
+OMath                    ; Other_Math
+OUpper                   ; Other_Uppercase
+Pat_Syn                  ; Pattern_Syntax
+Pat_WS                   ; Pattern_White_Space
+PCM                      ; Prepended_Concatenation_Mark
+QMark                    ; Quotation_Mark
+Radical                  ; Radical
+RI                       ; Regional_Indicator
+SD                       ; Soft_Dotted
+STerm                    ; Sentence_Terminal
+Term                     ; Terminal_Punctuation
+UIdeo                    ; Unified_Ideograph
+Upper                    ; Uppercase
+VS                       ; Variation_Selector
+WSpace                   ; White_Space                 ; space
+XIDC                     ; XID_Continue
+XIDS                     ; XID_Start
+XO_NFC                   ; Expands_On_NFC
+XO_NFD                   ; Expands_On_NFD
+XO_NFKC                  ; Expands_On_NFKC
+XO_NFKD                  ; Expands_On_NFKD
+
+# ================================================
+# Total:    129
+
+# EOF
--- a/maint/Unicode.tables/PropertyValueAliases.txt
+++ b/maint/Unicode.tables/PropertyValueAliases.txt
--- a/maint/Unicode.tables/ScriptExtensions.txt
+++ b/maint/Unicode.tables/ScriptExtensions.txt
@ -1,6 +1,6 @@
-# ScriptExtensions-13.0.0.txt
-# Date: 2020-01-22, 00:07:43 GMT
-# © 2020 Unicode®, Inc.
+# ScriptExtensions-14.0.0.txt
+# Date: 2021-06-04, 02:19:38 GMT
+# © 2021 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
@ -11,10 +11,10 @@
 # with more than one script, but with a limited number of scripts.
 # For each code point, there is one or more property values.  Each such value is a Script property value.
 # For more information, see:
-#   UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
+#   UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
 #     Especially the sections:
-#       http://www.unicode.org/reports/tr24/#Assignment_Script_Values
-#       http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
+#       https://www.unicode.org/reports/tr24/#Assignment_Script_Values
+#       https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
 #
 # Each Script_Extensions value in this file consists of a set
 # of one or more abbreviated Script property values. The ordering of the
@ -119,6 +119,14 @@

 # ================================================

+# Script_Extensions=Syrc
+
+1DFA          ; Syrc # Mn       COMBINING DOT BELOW LEFT
+
+# Total code points: 1
+
+# ================================================
+
 # Script_Extensions=Arab Copt

 102E0         ; Arab Copt # Mn       COPTIC EPACT THOUSANDS MARK
@ -136,6 +144,15 @@

 # ================================================

+# Script_Extensions=Arab Nkoo
+
+FD3E          ; Arab Nkoo # Pe       ORNATE LEFT PARENTHESIS
+FD3F          ; Arab Nkoo # Ps       ORNATE RIGHT PARENTHESIS
+
+# Total code points: 2
+
+# ================================================
+
 # Script_Extensions=Arab Syrc

 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
@ -186,10 +203,10 @@ A9CF          ; Bugi Java # Lm       JAVANESE PANGRANGKEP

 # Script_Extensions=Cprt Linb

-10100..10102  ; Cprt Linb # Po   [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
+10102         ; Cprt Linb # Po       AEGEAN CHECK MARK
 10137..1013F  ; Cprt Linb # So   [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT

-# Total code points: 12
+# Total code points: 10

 # ================================================

@ -342,6 +359,14 @@ FF9E..FF9F    ; Hira Kana # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFW

 # ================================================

+# Script_Extensions=Mani Ougr
+
+10AF2         ; Mani Ougr # Po       MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT
+
+# Total code points: 1
+
+# ================================================
+
 # Script_Extensions=Mong Phag

 1802..1803    ; Mong Phag # Po   [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
@ -383,6 +408,14 @@ FF9E..FF9F    ; Hira Kana # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFW

 # ================================================

+# Script_Extensions=Cpmn Cprt Linb
+
+10100..10101  ; Cpmn Cprt Linb # Po   [2] AEGEAN WORD SEPARATOR LINE..AEGEAN WORD SEPARATOR DOT
+
+# Total code points: 2
+
+# ================================================
+
 # Script_Extensions=Cprt Lina Linb

 10107..10133  ; Cprt Lina Linb # No  [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
@ -449,16 +482,6 @@ A92E          ; Kali Latn Mymr # Po       KAYAH LI SIGN CWI

 # ================================================

-# Script_Extensions=Arab Rohg Syrc Thaa Yezi
-
-060C          ; Arab Rohg Syrc Thaa Yezi # Po       ARABIC COMMA
-061B          ; Arab Rohg Syrc Thaa Yezi # Po       ARABIC SEMICOLON
-061F          ; Arab Rohg Syrc Thaa Yezi # Po       ARABIC QUESTION MARK
-
-# Total code points: 3
-
-# ================================================
-
 # Script_Extensions=Bopo Hang Hani Hira Kana

 3003          ; Bopo Hang Hani Hira Kana # Po       DITTO MARK
@ -474,6 +497,15 @@ FE45..FE46    ; Bopo Hang Hani Hira Kana # Po   [2] SESAME DOT..WHITE SESAME DOT

 # ================================================

+# Script_Extensions=Arab Nkoo Rohg Syrc Thaa Yezi
+
+060C          ; Arab Nkoo Rohg Syrc Thaa Yezi # Po       ARABIC COMMA
+061B          ; Arab Nkoo Rohg Syrc Thaa Yezi # Po       ARABIC SEMICOLON
+
+# Total code points: 2
+
+# ================================================
+
 # Script_Extensions=Bopo Hang Hani Hira Kana Yiii

 3001..3002    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
@ -513,9 +545,9 @@ FF64..FF65    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] HALFWIDTH IDEOGRAPHIC C

 # ================================================

-# Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
+# Script_Extensions=Adlm Arab Nkoo Rohg Syrc Thaa Yezi

-0640          ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm       ARABIC TATWEEL
+061F          ; Adlm Arab Nkoo Rohg Syrc Thaa Yezi # Po       ARABIC QUESTION MARK

 # Total code points: 1

@ -529,6 +561,14 @@ FF64..FF65    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] HALFWIDTH IDEOGRAPHIC C

 # ================================================

+# Script_Extensions=Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc
+
+0640          ; Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc # Lm       ARABIC TATWEEL
+
+# Total code points: 1
+
+# ================================================
+
 # Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh

 A836..A837    ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So   [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
--- a/Show More
+++ b/Show More