Cleanup of Makefile.os4, added release rule and a README file for this release

Implement -Z in pcre2grep and update documentation
Added some special heap tests
2022-07-31 20:34:33 +01:00 · 2022-07-30 17:41:49 +01:00 · 2022-07-28 17:58:19 +01:00 · 2022-07-27 18:00:40 +01:00 · 2022-07-27 17:44:55 +01:00 · 2022-07-15 17:18:11 +01:00
266 changed files with 82308 additions and 40561 deletions
--- a/.bazelrc
+++ b/.bazelrc
@ -0,0 +1,3 @@
+common --experimental_enable_bzlmod
+build --incompatible_enable_cc_toolchain_resolution
+build --incompatible_strict_action_env
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -0,0 +1,77 @@
+
+name: Build
+on: [push, pull_request]
+
+jobs:
+  linux:
+    name: Linux
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+    
+  alpine:
+    name: alpine
+    runs-on: ubuntu-latest
+    container: alpine 
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        
+      - name: Autotools
+        run: apk add --no-cache automake autoconf gcc libtool make musl-dev 
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+        
+  windows:      
+    name: 32bit Windows
+    runs-on: windows-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Configure
+        run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
+
+      - name: Build
+        run: cmake --build build
+
+      - name: Test
+        run: |
+          cd build\Debug
+          ..\..\RunTest.bat
+           
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -0,0 +1,73 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ master ]
+  schedule:
+    - cron: '27 6 * * 4'
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'cpp', 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Learn more about CodeQL language support at https://git.io/codeql-language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v1
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+
+    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v1
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+
+    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+
+    #- run: |
+    #   make bootstrap
+    #   make release
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v1
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@ -0,0 +1,55 @@
+name: Scorecards supply-chain security
+on:
+  # Only the default branch is supported.
+  branch_protection_rule:
+  schedule:
+    - cron: '23 17 * * 1'
+  push:
+    branches: [ master ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecards analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      actions: read
+      contents: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # Read-only PAT token. To create it,
+          # follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
+          repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
+          # Publish the results to enable scorecard badges. For more details, see
+          # https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories, `publish_results` will automatically be set to `false`,
+          # regardless of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional).
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard.
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
+        with:
+          sarif_file: results.sarif
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,82 @@
+# Public .gitignore file for PCRE2
+
+*.a
+*.lo
+*.la
+*.pc
+*.o
+*~
+*.lha
+
+__pycache__
+.deps
+.libs
+
+INSTALL
+Makefile
+Makefile.in
+RunGrepTest.log
+RunGrepTest.trs
+RunTest.log
+RunTest.trs
+
+aclocal.m4
+ar-lib
+compile
+config.guess
+config.log
+config.status
+config.sub
+configure
+depcomp
+install-sh
+libtool
+ltmain.sh
+missing
+pcre2-config
+pcre2_dftables
+pcre2_jit_test
+pcre2_jit_test.log
+pcre2_jit_test.trs
+pcre2demo
+pcre2fuzzcheck
+pcre2grep
+pcre2test
+test-driver
+test-suite.log
+test3input
+test3output
+testNinput
+testNinputgrep
+teststderr
+teststderrM
+teststderrgrep
+teststdout
+teststdoutM
+testtemp1
+testtemp1grep
+testtemp2
+testtemp2grep
+testtry
+testtrygrep
+
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+
+maint/ucptest
+maint/utf8
+
+src/.deps
+src/.dirstamp
+src/config.h
+src/pcre2.h
+src/pcre2_chartables.c
+src/stamp-h1
+
+/bazel-*
+
+# End
+
--- a/12
+++ b/12
@ -2,13 +2,13 @@ THE MAIN PCRE2 LIBRARY CODE
 ---------------------------

 Written by:       Philip Hazel
-Email local part: ph10
-Email domain:     cam.ac.uk
+Email local part: Philip.Hazel
+Email domain:     gmail.com

-University of Cambridge Computing Service,
+Retired from University of Cambridge Computing Service,
 Cambridge, England.

-Copyright (c) 1997-2019 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
 All rights reserved


@ -19,7 +19,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Emain domain:     freemail.hu

-Copyright(c) 2010-2019 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
 All rights reserved.


@ -30,7 +30,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Emain domain:     freemail.hu

-Copyright(c) 2009-2019 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
 All rights reserved.

 ####
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -0,0 +1,72 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+
+copy_file(
+    name = "config_h_generic",
+    src = "src/config.h.generic",
+    out = "src/config.h",
+)
+
+copy_file(
+    name = "pcre2_h_generic",
+    src = "src/pcre2.h.generic",
+    out = "src/pcre2.h",
+)
+
+copy_file(
+    name = "pcre2_chartables_c",
+    src = "src/pcre2_chartables.c.dist",
+    out = "src/pcre2_chartables.c",
+)
+
+cc_library(
+    name = "pcre2",
+    srcs = [
+        "src/pcre2_auto_possess.c",
+        "src/pcre2_compile.c",
+        "src/pcre2_config.c",
+        "src/pcre2_context.c",
+        "src/pcre2_convert.c",
+        "src/pcre2_dfa_match.c",
+        "src/pcre2_error.c",
+        "src/pcre2_extuni.c",
+        "src/pcre2_find_bracket.c",
+        "src/pcre2_maketables.c",
+        "src/pcre2_match.c",
+        "src/pcre2_match_data.c",
+        "src/pcre2_newline.c",
+        "src/pcre2_ord2utf.c",
+        "src/pcre2_pattern_info.c",
+        "src/pcre2_script_run.c",
+        "src/pcre2_serialize.c",
+        "src/pcre2_string_utils.c",
+        "src/pcre2_study.c",
+        "src/pcre2_substitute.c",
+        "src/pcre2_substring.c",
+        "src/pcre2_tables.c",
+        "src/pcre2_ucd.c",
+        "src/pcre2_ucptables.c",
+        "src/pcre2_valid_utf.c",
+        "src/pcre2_xclass.c",
+        ":pcre2_chartables_c",
+    ],
+    hdrs = glob(["src/*.h"]) + [
+        ":config_h_generic",
+        ":pcre2_h_generic",
+    ],
+    defines = [
+        "HAVE_CONFIG_H",
+        "PCRE2_CODE_UNIT_WIDTH=8",
+        "PCRE2_STATIC",
+    ],
+    includes = ["src"],
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+)
+
+cc_binary(
+    name = "pcre2demo",
+    srcs = ["src/pcre2demo.c"],
+    visibility = ["//visibility:public"],
+    deps = [":pcre2"],
+)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,5 @@
 # CMakeLists.txt
 #
-#
 # This file enables PCRE2 to be built with the CMake configuration and build
 # tool. Download CMake in source or binary form from http://www.cmake.org/
 # Converted to support PCRE2 from the original PCRE file, August 2014.
@ -85,19 +84,44 @@
 # 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h
 # 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied
 # 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below)
+# 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere)
+# 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace
+# 2020-04-08 Carlo added function check for secure_getenv, fixed strerror
+# 2020-04-16 enh added check for __attribute__((uninitialized))
+# 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and
+#            library versioning.
+# 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator
+# 2020-04-28 PH added function check for memfd_create based on Carlo's patch
+# 2020-05-25 PH added a check for Intel CET
+# 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel
+# 2021-06-29 JWSB added the option to build static library with PIC.
+# 2021-07-05 JWSB modified such both the static and shared library can be
+#            build in one go.
+# 2021-08-28 PH increased minimum version
+# 2021-08-28 PH added test for realpath()

 PROJECT(PCRE2 C)

-# Increased minimum to 2.8.0 to support newer add_test features.
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
+# Increased minimum to 2.8.5 to support GNUInstallDirs.
+# Increased minimum to 3.1 to support imported targets.
+CMAKE_MINIMUM_REQUIRED(VERSION 3.1)

 # Set policy CMP0026 to avoid warnings for the use of LOCATION in
 # GET_TARGET_PROPERTY. This should no longer be required.
 # CMAKE_POLICY(SET CMP0026 OLD)

-SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
+# With a recent cmake, you can provide a rootdir to look for non
+# standard installed library dependencies, but to do so, the policy
+# needs to be set to new (by uncommenting the following)
+# CMAKE_POLICY(SET CMP0074 NEW)

-SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src")
+# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
+# on the command line.
+# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+
+LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+
+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)

 # external packages
 FIND_PACKAGE( BZip2 )
@ -107,29 +131,66 @@ FIND_PACKAGE( Editline )

 # Configuration checks

-INCLUDE(CheckIncludeFile)
+INCLUDE(CheckCSourceCompiles)
 INCLUDE(CheckFunctionExists)
+INCLUDE(CheckSymbolExists)
+INCLUDE(CheckIncludeFile)
 INCLUDE(CheckTypeSize)
+INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR

 CHECK_INCLUDE_FILE(dirent.h     HAVE_DIRENT_H)
-CHECK_INCLUDE_FILE(stdint.h     HAVE_STDINT_H)
-CHECK_INCLUDE_FILE(inttypes.h   HAVE_INTTYPES_H)
 CHECK_INCLUDE_FILE(sys/stat.h   HAVE_SYS_STAT_H)
 CHECK_INCLUDE_FILE(sys/types.h  HAVE_SYS_TYPES_H)
 CHECK_INCLUDE_FILE(unistd.h     HAVE_UNISTD_H)
 CHECK_INCLUDE_FILE(windows.h    HAVE_WINDOWS_H)

-CHECK_FUNCTION_EXISTS(bcopy     HAVE_BCOPY)
-CHECK_FUNCTION_EXISTS(memmove   HAVE_MEMMOVE)
-CHECK_FUNCTION_EXISTS(strerror  HAVE_STRERROR)
+CHECK_SYMBOL_EXISTS(bcopy         "strings.h"  HAVE_BCOPY)
+CHECK_SYMBOL_EXISTS(memfd_create  "sys/mman.h" HAVE_MEMFD_CREATE)
+CHECK_SYMBOL_EXISTS(memmove       "string.h"   HAVE_MEMMOVE)
+CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h"   HAVE_SECURE_GETENV)
+CHECK_SYMBOL_EXISTS(strerror      "string.h"   HAVE_STRERROR)
+
+CHECK_C_SOURCE_COMPILES(
+  "#include <stdlib.h>
+   #include <limits.h>
+   int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
+  HAVE_REALPATH
+)
+
+set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
+CHECK_C_SOURCE_COMPILES(
+  "int main() { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }"
+  HAVE_ATTRIBUTE_UNINITIALIZED
+)
+set(CMAKE_REQUIRED_FLAGS ${ORIG_CMAKE_REQUIRED_FLAGS})
+
+# Check whether Intel CET is enabled, and if so, adjust compiler flags. This
+# code was written by PH, trying to imitate the logic from the autotools
+# configuration.
+
+CHECK_C_SOURCE_COMPILES(
+  "#ifndef __CET__
+   #error CET is not enabled
+   #endif
+   int main() { return 0; }"
+  INTEL_CET_ENABLED
+)
+
+IF (INTEL_CET_ENABLED)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mshstk")
+ENDIF(INTEL_CET_ENABLED)
+
+

 # User-configurable options
 #
 # Note: CMakeSetup displays these in alphabetical order, regardless of
 # the order we use here.

-SET(BUILD_SHARED_LIBS OFF CACHE BOOL
-    "Build shared libraries instead of static ones.")
+SET(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries.")
+
+OPTION(BUILD_STATIC_LIBS "Build static libraries." ON)

 OPTION(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON)

@ -137,6 +198,8 @@ OPTION(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF)

 OPTION(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF)

+OPTION(PCRE2_STATIC_PIC "Build the static library with the option position independent code enabled." OFF)
+
 OPTION(PCRE2_DEBUG "Include debugging code" OFF)

 OPTION(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF)
@ -177,8 +240,12 @@ SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL
 SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
    "Enable support for Just-in-time compiling.")

+IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
    SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
-    "Enable SELinux compatible execmem allocator in JIT.")
+        "Enable SELinux compatible execmem allocator in JIT (experimental).")
+ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
+    SET(PCRE2_SUPPORT_JIT_SEALLOC IGNORE)
+ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)

 SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL
    "Enable use of Just-in-time compiling in pcre2grep.")
@ -244,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
 IF(EDITLINE_FOUND)
  OPTION (PCRE2_SUPPORT_LIBEDIT  "Enable support for linking pcre2test with libedit." OFF)
 ENDIF(EDITLINE_FOUND)
+IF(EDITLINE_FOUND)
  IF(PCRE2_SUPPORT_LIBEDIT)
    INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
  ENDIF(PCRE2_SUPPORT_LIBEDIT)
+ELSE(EDITLINE_FOUND)
+  IF(PCRE2_SUPPORT_LIBEDIT)
+    MESSAGE(FATAL_ERROR
+      " libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
+      " or set Editline_ROOT to a full libedit installed tree, as needed\n"
+      " Might need to enable policy CMP0074 in CMakeLists.txt"
+    )
+  ENDIF(PCRE2_SUPPORT_LIBEDIT)
+ENDIF(EDITLINE_FOUND)

 # readline lib
 IF(READLINE_FOUND)
@ -258,9 +335,9 @@ ENDIF(PCRE2_SUPPORT_LIBREADLINE)

 # Prepare build configuration

-IF(NOT BUILD_SHARED_LIBS)
-        SET(PCRE2_STATIC 1)
-ENDIF(NOT BUILD_SHARED_LIBS)
+IF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
+        MESSAGE(FATAL_ERROR "At least one of BUILD_SHARED_LIBS or BUILD_STATIC_LIBS must be enabled.")
+ENDIF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)

 IF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32)
        MESSAGE(FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled")
@ -284,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
 ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)

 IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
-        MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
+        IF(READLINE_FOUND)
+                MESSAGE(FATAL_ERROR
+                  " Only one of the readline compatible libraries can be enabled.\n"
+                  " Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
+                )
+        ENDIF(READLINE_FOUND)
 ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)

 IF(PCRE2_SUPPORT_BSR_ANYCRLF)
@ -301,10 +383,28 @@ ENDIF(PCRE2_SUPPORT_UNICODE)

 IF(PCRE2_SUPPORT_JIT)
 	SET(SUPPORT_JIT 1)
+	IF(UNIX)
+		FIND_PACKAGE(Threads REQUIRED)
+		IF(CMAKE_USE_PTHREADS_INIT)
+			SET(REQUIRE_PTHREAD 1)
+		ENDIF(CMAKE_USE_PTHREADS_INIT)
+	ENDIF(UNIX)
 ENDIF(PCRE2_SUPPORT_JIT)

 IF(PCRE2_SUPPORT_JIT_SEALLOC)
+        SET(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
+	CHECK_SYMBOL_EXISTS(mkostemp stdlib.h REQUIRED)
+        UNSET(CMAKE_REQUIRED_DEFINITIONS)
+        IF(${REQUIRED})
+                IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
+                        ADD_DEFINITIONS(-D_GNU_SOURCE)
                        SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
+                ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
+                        MESSAGE(FATAL_ERROR "Your configuration is not supported")
+                ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
+        ELSE(${REQUIRED})
+                SET(PCRE2_SUPPORT_JIT_SEALLOC OFF)
+        ENDIF(${REQUIRED})
 ENDIF(PCRE2_SUPPORT_JIT_SEALLOC)

 IF(PCRE2GREP_SUPPORT_JIT)
@ -400,12 +500,13 @@ file(STRINGS ${PROJECT_SOURCE_DIR}/configure.ac
  LIMIT_COUNT 50 # Read only the first 50 lines of the file
 )

-set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date")
+set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date"
+  "libpcre2_posix_version" "libpcre2_8_version" "libpcre2_16_version" "libpcre2_32_version")
 foreach(configure_line ${configure_lines})
    foreach(_substitution_variable ${SEARCHED_VARIABLES})
        string(TOUPPER ${_substitution_variable} _substitution_variable_upper)
        if (NOT ${_substitution_variable_upper})
-            string(REGEX MATCH "m4_define\\(${_substitution_variable}, \\[(.*)\\]" MACTHED_STRING ${configure_line})
+            string(REGEX MATCH "m4_define\\(${_substitution_variable}, *\\[(.*)\\]" MATCHED_STRING ${configure_line})
            if (CMAKE_MATCH_1)
                set(${_substitution_variable_upper} ${CMAKE_MATCH_1})
            endif()
@ -413,21 +514,83 @@ foreach(configure_line ${configure_lines})
    endforeach()
 endforeach()

+macro(PARSE_LIB_VERSION VARIABLE_PREFIX)
+  string(REPLACE ":" ";" ${VARIABLE_PREFIX}_VERSION_LIST ${${VARIABLE_PREFIX}_VERSION})
+  list(GET ${VARIABLE_PREFIX}_VERSION_LIST 0 ${VARIABLE_PREFIX}_VERSION_CURRENT)
+  list(GET ${VARIABLE_PREFIX}_VERSION_LIST 1 ${VARIABLE_PREFIX}_VERSION_REVISION)
+  list(GET ${VARIABLE_PREFIX}_VERSION_LIST 2 ${VARIABLE_PREFIX}_VERSION_AGE)
+
+  math(EXPR ${VARIABLE_PREFIX}_SOVERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} - ${${VARIABLE_PREFIX}_VERSION_AGE}")
+  math(EXPR ${VARIABLE_PREFIX}_MACHO_COMPATIBILITY_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
+  math(EXPR ${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
+  set(${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION}.${${VARIABLE_PREFIX}_VERSION_REVISION}}")
+  set(${VARIABLE_PREFIX}_VERSION "${${VARIABLE_PREFIX}_SOVERSION}.${${VARIABLE_PREFIX}_VERSION_AGE}.${${VARIABLE_PREFIX}_VERSION_REVISION}")
+endmacro()
+
+PARSE_LIB_VERSION(LIBPCRE2_POSIX)
+PARSE_LIB_VERSION(LIBPCRE2_8)
+PARSE_LIB_VERSION(LIBPCRE2_16)
+PARSE_LIB_VERSION(LIBPCRE2_32)
+
 CONFIGURE_FILE(src/pcre2.h.in
               ${PROJECT_BINARY_DIR}/pcre2.h
               @ONLY)

-# What about pcre2-config and libpcre2.pc?
+# Make sure to not link debug libs
+# against release libs and vice versa
+IF(WIN32)
+  SET(CMAKE_DEBUG_POSTFIX "d")
+ENDIF(WIN32)
+
+# Generate pkg-config files
+
+SET(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}")
+SET(prefix ${CMAKE_INSTALL_PREFIX})
+
+SET(exec_prefix "\${prefix}")
+SET(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
+SET(includedir "\${prefix}/include")
+IF(WIN32 AND (CMAKE_BUILD_TYPE MATCHES Debug))
+  SET(LIB_POSTFIX ${CMAKE_DEBUG_POSTFIX})
+ENDIF()
+CONFIGURE_FILE(libpcre2-posix.pc.in libpcre2-posix.pc @ONLY)
+SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-posix.pc")
+
+IF(PCRE2_BUILD_PCRE2_8)
+  CONFIGURE_FILE(libpcre2-8.pc.in libpcre2-8.pc @ONLY)
+  SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-8.pc")
+  SET(enable_pcre2_8 "yes")
+ELSE()
+  SET(enable_pcre2_8 "no")
+ENDIF()
+
+IF(PCRE2_BUILD_PCRE2_16)
+  CONFIGURE_FILE(libpcre2-16.pc.in libpcre2-16.pc @ONLY)
+  SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-16.pc")
+  SET(enable_pcre2_16 "yes")
+ELSE()
+  SET(enable_pcre2_16 "no")
+ENDIF()
+
+IF(PCRE2_BUILD_PCRE2_32)
+  CONFIGURE_FILE(libpcre2-32.pc.in libpcre2-32.pc @ONLY)
+  SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-32.pc")
+  SET(enable_pcre2_32 "yes")
+ELSE()
+  SET(enable_pcre2_32 "no")
+ENDIF()
+
+CONFIGURE_FILE(pcre2-config.in pcre2-config @ONLY)

 # Character table generation

 OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)
 IF(PCRE2_REBUILD_CHARTABLES)
-  ADD_EXECUTABLE(dftables src/dftables.c)
+  ADD_EXECUTABLE(pcre2_dftables src/pcre2_dftables.c)
  ADD_CUSTOM_COMMAND(
    COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
-    DEPENDS dftables
-    COMMAND dftables
+    DEPENDS pcre2_dftables
+    COMMAND pcre2_dftables
    ARGS        ${PROJECT_BINARY_DIR}/pcre2_chartables.c
    OUTPUT      ${PROJECT_BINARY_DIR}/pcre2_chartables.c
  )
@ -474,39 +637,37 @@ SET(PCRE2_SOURCES
 SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
 SET(PCRE2POSIX_SOURCES src/pcre2posix.c)

-IF(MINGW AND NOT PCRE2_STATIC)
+IF(MINGW AND BUILD_SHARED_LIBS)
  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
    ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
      PRE-LINK
      COMMAND windres ARGS pcre2.rc pcre2.o
      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
      COMMENT Using pcre2 coff info in mingw build)
-SET(PCRE2_SOURCES
-  ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o
-)
+    SET(PCRE2_SOURCES ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o)
  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
+
  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
    ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
      PRE-LINK
      COMMAND windres ARGS pcre2posix.rc pcre2posix.o
      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
      COMMENT Using pcre2posix coff info in mingw build)
-SET(PCRE2POSIX_SOURCES
-  ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o
-)
+    SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o)
  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+ENDIF(MINGW AND BUILD_SHARED_LIBS)

-IF(MSVC AND NOT PCRE2_STATIC)
+IF(MSVC AND BUILD_SHARED_LIBS)
+  SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
+  SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
-SET(PCRE2_SOURCES
-  ${PCRE2_SOURCES} pcre2.rc)
+    SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
  ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
+
  IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-SET(PCRE2POSIX_SOURCES
-  ${PCRE2POSIX_SOURCES} pcre2posix.rc)
+    SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} pcre2posix.rc)
  ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
-ENDIF(MSVC AND NOT PCRE2_STATIC)
+ENDIF(MSVC AND BUILD_SHARED_LIBS)

 # Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681
 # This code was taken from the CMake wiki, not from WebM.
@ -529,71 +690,219 @@ IF(MSVC)
 ENDIF(MSVC)

 SET(CMAKE_INCLUDE_CURRENT_DIR 1)
-# needed to make sure to not link debug libs
-# against release libs and vice versa
-IF(WIN32)
-  SET(CMAKE_DEBUG_POSTFIX "d")
-ENDIF(WIN32)

 SET(targets)

 # 8-bit library

 IF(PCRE2_BUILD_PCRE2_8)
-ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
-SET_PROPERTY(TARGET pcre2-8
-  PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
-SET(targets ${targets} pcre2-8)
-ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
-SET_PROPERTY(TARGET pcre2-posix
-  PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
-SET(targets ${targets} pcre2-posix)
-TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-8-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_8_VERSION}
+      SOVERSION ${LIBPCRE2_8_SOVERSION})
+    TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-8-static)
+    ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+    SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_POSIX_VERSION}
+      SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
+    TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
+    TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
+    SET(targets ${targets} pcre2-posix-static)

-IF(MINGW AND NOT PCRE2_STATIC)
+    IF(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8-static)
+      SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix-static)
+    ELSE(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8)
+      SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix)
+    ENDIF(MSVC)
+    IF(PCRE2_STATIC_PIC)
+      SET_TARGET_PROPERTIES(pcre2-8-static pcre2-posix-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    ENDIF(PCRE2_STATIC_PIC)
+  ENDIF(BUILD_STATIC_LIBS)
+
+  IF(BUILD_SHARED_LIBS)
+    ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_8_VERSION}
+      SOVERSION ${LIBPCRE2_8_SOVERSION}
+      OUTPUT_NAME pcre2-8)
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-8-shared)
+    ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_POSIX_VERSION}
+      SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
+      OUTPUT_NAME pcre2-posix)
+    TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
+    SET(targets ${targets} pcre2-posix-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
+
+    IF(MINGW)
      IF(NON_STANDARD_LIB_PREFIX)
-    SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "")
+        SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES PREFIX "")
      ENDIF(NON_STANDARD_LIB_PREFIX)
      IF(NON_STANDARD_LIB_SUFFIX)
-    SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll")
+        SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES SUFFIX "-0.dll")
      ENDIF(NON_STANDARD_LIB_SUFFIX)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+    ENDIF(MINGW)
+  ENDIF(BUILD_SHARED_LIBS)
+
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-static)
+    ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-static)
+  ELSE(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-shared)
+    ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-shared)
+  ENDIF(BUILD_STATIC_LIBS)
 ENDIF(PCRE2_BUILD_PCRE2_8)

 # 16-bit library

 IF(PCRE2_BUILD_PCRE2_16)
-ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
-SET_PROPERTY(TARGET pcre2-16
-  PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16)
-SET(targets ${targets} pcre2-16)
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_16_VERSION}
+      SOVERSION ${LIBPCRE2_16_SOVERSION})
+    TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-16-static)

-IF(MINGW AND NOT PCRE2_STATIC)
+    IF(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16-static)
+    ELSE(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16)
+    ENDIF(MSVC)
+    IF(PCRE2_STATIC_PIC)
+      SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    ENDIF(PCRE2_STATIC_PIC)
+  ENDIF(BUILD_STATIC_LIBS)
+
+  IF(BUILD_SHARED_LIBS)
+    ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_16_VERSION}
+      SOVERSION ${LIBPCRE2_16_SOVERSION}
+      OUTPUT_NAME pcre2-16)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-16-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
+
+    IF(MINGW)
      IF(NON_STANDARD_LIB_PREFIX)
-    SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES PREFIX "")
+        SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES PREFIX "")
      ENDIF(NON_STANDARD_LIB_PREFIX)
      IF(NON_STANDARD_LIB_SUFFIX)
-    SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES SUFFIX "-0.dll")
+        SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES SUFFIX "-0.dll")
      ENDIF(NON_STANDARD_LIB_SUFFIX)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+    ENDIF(MINGW)
+  ENDIF(BUILD_SHARED_LIBS)
+
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-static)
+  ELSE(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-shared)
+  ENDIF(BUILD_STATIC_LIBS)
 ENDIF(PCRE2_BUILD_PCRE2_16)

 # 32-bit library

 IF(PCRE2_BUILD_PCRE2_32)
-ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
-SET_PROPERTY(TARGET pcre2-32
-  PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32)
-SET(targets ${targets} pcre2-32)
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_32_VERSION}
+      SOVERSION ${LIBPCRE2_32_SOVERSION})
+    TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-32-static)

-IF(MINGW AND NOT PCRE2_STATIC)
+    IF(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32-static)
+    ELSE(MSVC)
+      SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32)
+    ENDIF(MSVC)
+    IF(PCRE2_STATIC_PIC)
+      SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    ENDIF(PCRE2_STATIC_PIC)
+  ENDIF(BUILD_STATIC_LIBS)
+
+  IF(BUILD_SHARED_LIBS)
+    ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
+    SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
+      COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
+      MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
+      MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
+      VERSION ${LIBPCRE2_32_VERSION}
+      SOVERSION ${LIBPCRE2_32_SOVERSION}
+      OUTPUT_NAME pcre2-32)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
+    SET(targets ${targets} pcre2-32-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
+
+    IF(MINGW)
      IF(NON_STANDARD_LIB_PREFIX)
-    SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES PREFIX "")
+        SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES PREFIX "")
      ENDIF(NON_STANDARD_LIB_PREFIX)
      IF(NON_STANDARD_LIB_SUFFIX)
-    SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES SUFFIX "-0.dll")
+        SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES SUFFIX "-0.dll")
      ENDIF(NON_STANDARD_LIB_SUFFIX)
-ENDIF(MINGW AND NOT PCRE2_STATIC)
+    ENDIF(MINGW)
+  ENDIF(BUILD_SHARED_LIBS)
+
+  IF(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-static)
+  ELSE(BUILD_STATIC_LIBS)
+    ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-shared)
+  ENDIF(BUILD_STATIC_LIBS)
 ENDIF(PCRE2_BUILD_PCRE2_32)

 # Executables
@ -718,7 +1027,9 @@ if test \"$?\" != \"0\"; then exit 1; fi
 \@echo off
 setlocal
 SET srcdir=\"${winsrc}\"
-SET pcre2test=\"${winexe}\"
+# The next line was replaced by the following one after a user comment.
+# SET pcre2test=\"${winexe}\"
+SET pcre2test=\"${winbin}\\pcre2test.exe\"
 if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2test=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2test.exe\"
 call %srcdir%\\RunTest.Bat
 if errorlevel 1 exit /b 1
@ -754,42 +1065,44 @@ SET(CMAKE_INSTALL_ALWAYS 1)

 INSTALL(TARGETS ${targets}
        RUNTIME DESTINATION bin
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+INSTALL(FILES ${pkg_config_files} DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"
+  DESTINATION bin
+  # Set 0755 permissions
+  PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)

 INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)

+# CMake config files.
+set(PCRE2_CONFIG_IN  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config.cmake.in)
+set(PCRE2_CONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config.cmake)
+configure_file(${PCRE2_CONFIG_IN} ${PCRE2_CONFIG_OUT} @ONLY)
+set(PCRE2_CONFIG_VERSION_IN  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config-version.cmake.in)
+set(PCRE2_CONFIG_VERSION_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config-version.cmake)
+configure_file(${PCRE2_CONFIG_VERSION_IN} ${PCRE2_CONFIG_VERSION_OUT} @ONLY)
+install(FILES ${PCRE2_CONFIG_OUT} ${PCRE2_CONFIG_VERSION_OUT} DESTINATION cmake)
+
 FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
 FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
 FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)

-FOREACH(man ${man3})
-        GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
-        SET(man3_new ${man3} ${man})
-ENDFOREACH(man ${man3})
-SET(man3 ${man3_new})
-
 INSTALL(FILES ${man1} DESTINATION man/man1)
 INSTALL(FILES ${man3} DESTINATION man/man3)
 INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)

 IF(MSVC AND INSTALL_MSVC_PDB)
-    INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
-                  ${PROJECT_BINARY_DIR}/pcre2posix.pdb
-            DESTINATION bin
-            CONFIGURATIONS RelWithDebInfo)
-    INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
-                  ${PROJECT_BINARY_DIR}/pcre2posixd.pdb
-            DESTINATION bin
-            CONFIGURATIONS Debug)
+ INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
+ INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
 ENDIF(MSVC AND INSTALL_MSVC_PDB)

 # Help, only for nice output
-IF(BUILD_SHARED_LIBS)
-  SET(BUILD_STATIC_LIBS OFF)
-ELSE(BUILD_SHARED_LIBS)
+IF(BUILD_STATIC_LIBS)
  SET(BUILD_STATIC_LIBS ON)
-ENDIF(BUILD_SHARED_LIBS)
+ELSE(BUILD_STATIC_LIBS)
+  SET(BUILD_STATIC_LIBS OFF)
+ENDIF(BUILD_STATIC_LIBS)

 IF(PCRE2_HEAP_MATCH_RECURSE)
  MESSAGE(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.")
@ -802,7 +1115,7 @@ IF(PCRE2_SHOW_REPORT)
  ENDIF(CMAKE_C_FLAGS)
  MESSAGE(STATUS "")
  MESSAGE(STATUS "")
-  MESSAGE(STATUS "PCRE2 configuration summary:")
+  MESSAGE(STATUS "PCRE2-${PCRE2_MAJOR}.${PCRE2_MINOR} configuration summary:")
  MESSAGE(STATUS "")
  MESSAGE(STATUS "  Install prefix .................. : ${CMAKE_INSTALL_PREFIX}")
  MESSAGE(STATUS "  C compiler ...................... : ${CMAKE_C_COMPILER}")
@ -827,6 +1140,7 @@ IF(PCRE2_SHOW_REPORT)
  MESSAGE(STATUS "  Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}")
  MESSAGE(STATUS "  Build shared libs ............... : ${BUILD_SHARED_LIBS}")
  MESSAGE(STATUS "  Build static libs ............... : ${BUILD_STATIC_LIBS}")
+  MESSAGE(STATUS "     with PIC enabled ............. : ${PCRE2_STATIC_PIC}")
  MESSAGE(STATUS "  Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
  MESSAGE(STATUS "  Enable JIT in pcre2grep ......... : ${PCRE2GREP_SUPPORT_JIT}")
  MESSAGE(STATUS "  Enable callouts in pcre2grep .... : ${PCRE2GREP_SUPPORT_CALLOUT}")
@ -861,10 +1175,10 @@ IF(PCRE2_SHOW_REPORT)
    MESSAGE(STATUS "  Use %zu and %td ..................: AUTO" )
  ENDIF(PCRE2_DISABLE_PERCENT_ZT)

-  IF(MINGW AND NOT PCRE2_STATIC)
+  IF(MINGW AND BUILD_SHARED_LIBS)
    MESSAGE(STATUS "  Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
    MESSAGE(STATUS "  Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
-  ENDIF(MINGW AND NOT PCRE2_STATIC)
+  ENDIF(MINGW AND BUILD_SHARED_LIBS)

  IF(MSVC)
    MESSAGE(STATUS "  Install MSVC .pdb files ..........: ${INSTALL_MSVC_PDB}")
--- a/747
+++ b/747
@ -1,5 +1,746 @@
-Change Log for PCRE2
--------------------
+Change Log for PCRE2 - see also the Git log
+-------------------------------------------
+
+
+Version 10.41 xx-xxx-2022
+-------------------------
+
+1. Add fflush() before and after a fork callout in pcre2grep to get its output
+to be the same on all systems. (THere were previously ordering differences in
+Alpine Linux).
+
+2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
+
+3. SSF scorecards grumbled about possible overflow in an expression in
+pcre2test. It never would have overflowed in practice, but some casts have been
+added and at the some time there's been some tidying of fprints that output
+size_t values.
+
+4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
+
+5. Minor code re-arrangement to remove gcc warning about realloc() in
+pcre2test.
+
+6. Change a number of int variables that hold buffer and line lengths in
+pcre2grep to PCRE2_SIZE (aka size_t).
+
+7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
+supported (even though that function would do nothing in that case) at the
+request of a user who doesn't even want to link with pcre_jit_compile.o. Also
+tidied up an untidy #ifdef arrangement in pcre2test.
+
+8. Fixed an issue in the backtracking optimization of character repeats in
+JIT. Furthermore optimize star repetitions, not just plus repetitions.
+
+9. Removed the use of an initial backtracking frames vector on the system stack 
+in pcre2_match() so that it now always uses the heap. (In a multi-thread 
+environment with very small stacks there had been an issue.) This also is 
+tidier for JIT matching, which didn't need that vector. The heap vector is now 
+remembered in the match data block and re-used if that block itself is re-used. 
+It is freed with the match data block.
+
+10. Adjusted the find_limits code in pcre2test to work with change 9 above.
+
+11. Added find_limits_noheap to pcre2test, because the heap limits are now 
+different in different environments and so cannot be included in the standard 
+tests.
+
+12. Created a test for pcre2_match() heap processing that is not part of the 
+tests run by 'make check', but can be run manually. The current output is from 
+a 64-bit system.
+
+13. Implemented -Z aka --null in pcre2grep.
+
+
+Version 10.40 15-April-2022
+---------------------------
+
+1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
+handling of multiple passes.
+
+2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
+in pcre2grep with buffered fseek(stdin).
+
+3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
+not supported.
+
+4. Revert an unintended change in JIT repeat detection.
+
+5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
+
+6. Merged documentation and comments patches from @carenas (GitHub #47).
+
+7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
+from pcre2grep.
+
+8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
+
+9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
+substituting.
+
+10. Add null_subject and null_replacement modifiers to pcre2test.
+
+11. Add check for NULL subject to POSIX regexec() function.
+
+12. Add check for NULL replacement to pcre2_substitute().
+
+13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
+pcre2_substitute(), and the replacement argument of the latter, if the pointer
+is NULL and the length is zero, treat as an empty string. Apparently a number
+of applications treat NULL/0 in this way.
+
+14. Added support for Bidi_Class and a number of binary Unicode properties,
+including Bidi_Control.
+
+15. Fix some minor issues raised by clang sanitize.
+
+16. Very minor code speed up for maximizing character property matches.
+
+17. A number of changes to script matching for \p and \P:
+
+    (a) Script extensions for a character are now coded as a bitmap instead of
+        a list of script numbers, which should be faster and does not need a
+        loop.
+
+    (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
+        sc and scx).
+
+    (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
+        the same as \p{scx:scriptname} because this change happened in Perl at
+        release 5.26.
+
+    (d) The standard Unicode 4-letter abbreviations for script names are now
+        recognized.
+
+    (e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
+        hyphens, and underscores are ignored in property names, which are then
+        matched independent of case.
+
+18. The Python scripts in the maint directory have been refactored. There are
+now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
+(which is #included by pcre2_tables.c). The data lists that used to be
+duplicated are now held in a single common Python module.
+
+19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
+hardware capabilities, which consist of both an integer address and additional
+metadata, meaning they are twice the size of the platform's size_t type, i.e.
+16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
+8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
+not 16. Whilst the first frame was always suitably aligned, this then
+misaligned the frame that follows, resulting in an alignment fault when storing
+a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
+Clarke PR#72.
+
+20. Added -LP and -LS listing options to pcre2test.
+
+21. A user discovered that the library names in CMakeLists.txt for MSVC
+debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
+
+22. An item such as [Aa] is optimized into a caseless single character match.
+When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
+pattern, the optimizing "must be present for a match" character check was not
+being flagged as caseless, causing some matches that should have succeeded to
+fail.
+
+23. Fixed a unicode property matching issue in JIT. The character was not
+fully read in caseless matching.
+
+24. Fixed an issue affecting recursions in JIT caused by duplicated data
+transfers.
+
+25. Merged patch from @carenas (GitHub #96) which fixes some problems with
+pcre2test and readline/readedit:
+
+  * Use the right header for libedit in FreeBSD with autoconf
+  * Really allow libedit with cmake
+  * Avoid using readline headers with libedit
+
+
+Version 10.39 29-October-2021
+-----------------------------
+
+1. Fix incorrect detection of alternatives in first character search in JIT.
+
+2. Merged patch from @carenas (GitHub #28):
+
+  Visual Studio 2013 includes support for %zu and %td, so let newer
+  versions of it avoid the fallback, and while at it, make sure that
+  the first check is for DISABLE_PERCENT_ZT so it will be always
+  honoured if chosen.
+
+  prtdiff_t is signed, so use a signed type instead, and make sure
+  that an appropriate width is chosen if pointers are 64bit wide and
+  long is not (ex: Windows 64bit).
+
+  IMHO removing the cast (and therefore the possibilty of truncation)
+  make the code cleaner and the fallback is likely portable enough
+  with all 64-bit POSIX systems doing LP64 except for Windows.
+
+3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0.
+
+4. Merged patch from @carenas (GitHub #30):
+
+  * Cleanup: remove references to no longer used stdint.h
+
+  Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h
+  (simplification) and remove the now unnecessary inclusion in
+  pcre2_internal.h., 2018-11-14), stdint.h is no longer used.
+
+  Remove checks for it in autotools and CMake and document better the expected
+  build failures for systems that might have stdint.h (C99) and not inttypes.h
+  (from POSIX), like old Windows.
+
+  * Cleanup: remove detection for inttypes.h which is a hard dependency
+
+  CMake checks for standard headers are not meant to be used for hard
+  dependencies, so will prevent a possible fallback to work.
+
+  Alternatively, the header could be checked to make the configuration fail
+  instead of breaking the build, but that was punted, as it was missing anyway
+  from autotools.
+
+5. Merged patch from @carenas (GitHub #32):
+
+  * jit: allow building with ancient MSVC versions
+
+  Visual Studio older than 2013 fails to build with JIT enabled, because it is
+  unable to parse non C89 compatible syntax, with mixed declarations and code.
+  While most recent compilers wouldn't even report this as a warning since it
+  is valid C99, it could be also made visible by adding to gcc/clang the
+  -Wdeclaration-after-statement flag at build time.
+
+  Move the code below the affected definitions.
+
+  * pcre2grep: avoid mixing declarations with code
+
+  Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep,
+  2021-08-28), code will fail to build in a strict C89 compiler.
+
+  Reformat slightly to make it C89 compatible again.
+
+
+Version 10.38 01-October-2021
+-----------------------------
+
+1. Fix invalid single character repetition issues in JIT when the repetition
+is inside a capturing bracket and the bracket is preceded by character
+literals.
+
+2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
+This extends the CMake build system to build both static and shared libraries
+in one go, builds the static library with PIC, and exposes PCRE2 libraries
+using the CMake config files. JWB provided these notes:
+
+- Introduced CMake variable BUILD_STATIC_LIBS to build the static library.
+
+- Make a small modification to config-cmake.h.in by removing the PCRE2_STATIC
+  variable. Added PCRE2_STATIC variable to the static build using the
+  target_compile_definitions() function.
+
+- Extended the CMake config files.
+
+  - Introduced CMake variable PCRE2_USE_STATIC_LIBS to easily switch between
+    the static and shared libraries.
+
+  - Added the PCRE_STATIC variable to the target compile definitions for the
+    import of the static library.
+
+Building static and shared libraries using MSVC results in a name clash of
+the libraries. Both static and shared library builds create, for example, the
+file pcre2-8.lib. Therefore, I decided to change the static library names by
+adding "-static". For example, pcre2-8.lib has become pcre2-8-static.lib.
+[Comment by PH: this is MSVC-specific. It doesn't happen on Linux.]
+
+3. Increased the minimum release number for CMake to 3.0.0 because older than
+2.8.12 is deprecated (it was set to 2.8.5) and causes warnings. Even 3.0.0 is
+quite old; it was released in 2014.
+
+4. Implemented a modified version of Thomas Tempelmann's pcre2grep patch for
+detecting symlink loops. This is dependent on the availability of realpath(),
+which is now tested for in ./configure and CMakeLists.txt.
+
+5. Implemented a modified version of Thomas Tempelmann's patch for faster
+case-independent "first code unit" searches for unanchored patterns in 8-bit
+mode in the interpreters. Instead of just remembering whether one case matched
+or not, it remembers the position of a previous match so as to avoid
+unnecessary repeated searching.
+
+6. Perl now locks out \K in lookarounds, so PCRE2 now does the same by default.
+However, just in case anybody was relying on the old behaviour, there is an
+option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour.
+An option has also been added to pcre2grep to enable this.
+
+7. Re-enable a JIT optimization which was unintentionally disabled in 10.35.
+
+8. There is a loop counter to catch excessively crazy patterns when checking
+the lengths of lookbehinds at compile time. This was incorrectly getting reset
+whenever a lookahead was processed, leading to some fuzzer-generated patterns
+taking a very long time to compile when (?|) was present in the pattern,
+because (?|) disables caching of group lengths.
+
+
+Version 10.37 26-May-2021
+-------------------------
+
+1. Change RunGrepTest to use tr instead of sed when testing with binary
+zero bytes, because sed varies a lot from system to system and has problems
+with binary zeros. This is from Bugzilla #2681. Patch from Jeremie
+Courreges-Anglas via Nam Nguyen. This fixes RunGrepTest for OpenBSD. Later:
+it broke it for at least one version of Solaris, where tr can't handle binary
+zeros. However, that system had /usr/xpg4/bin/tr installed, which works OK, so
+RunGrepTest now checks for that command and uses it if found.
+
+2. Compiling with gcc 10.2's -fanalyzer option showed up a hypothetical problem
+with a NULL dereference. I don't think this case could ever occur in practice,
+but I have put in a check in order to get rid of the compiler error.
+
+3. An alternative patch for CMakeLists.txt because 10.36 #4 breaks CMake on
+Windows. Patch from email@cs-ware.de fixes bugzilla #2688.
+
+4. Two bugs related to over-large numbers have been fixed so the behaviour is
+now the same as Perl.
+
+  (a) A pattern such as /\214748364/ gave an overflow error instead of being
+  treated as the octal number \214 followed by literal digits.
+
+  (b) A sequence such as {65536 that has no terminating } so is not a
+  quantifier was nevertheless complaining that a quantifier number was too big.
+
+5. A run of autoconf suggested that configure.ac was out-of-date with respect
+to the lastest autoconf. Running autoupdate made some valid changes, some valid
+suggestions, and also some invalid changes, which were fixed by hand. Autoconf
+now runs clean and the resulting "configure" seems to work, so I hope nothing
+is broken. Later: the requirement for autoconf 2.70 broke some automatic test
+robots. It doesn't seem to be necessary: trying a reduction to 2.60.
+
+6. The pattern /a\K.(?0)*/ when matched against "abac" by the interpreter gave
+the answer "bac", whereas Perl and JIT both yield "c". This was because the
+effect of \K was not propagating back from the full pattern recursion. Other
+recursions such as /(a\K.(?1)*)/ did not have this problem.
+
+7. Restore single character repetition optimization in JIT. Currently fewer
+character repetitions are optimized than in 10.34.
+
+8. When the names of the functions in the POSIX wrapper were changed to
+pcre2_regcomp() etc. (see change 10.33 #4 below), functions with the original
+names were left in the library so that pre-compiled programs would still work.
+However, this has proved troublesome when programs link with several libraries,
+some of which use PCRE2 via the POSIX interface while others use a native POSIX
+library. For this reason, the POSIX function names are removed in this release.
+The macros in pcre2posix.h should ensure that re-compiling fixes any programs
+that haven't been compiled since before 10.33.
+
+
+Version 10.36 04-December-2020
+------------------------------
+
+1. Add CET_CFLAGS so that when Intel CET is enabled, pass -mshstk to
+compiler. This fixes https://bugs.exim.org/show_bug.cgi?id=2578. Patch for
+Makefile.am and configure.ac by H.J. Lu. Equivalent patch for CMakeLists.txt
+invented by PH.
+
+2. Fix inifinite loop when a single byte newline is searched in JIT when
+invalid utf8 mode is enabled.
+
+3. Updated CMakeLists.txt with patch from Wolfgang Stöggl (Bugzilla #2584):
+
+  - Include GNUInstallDirs and use ${CMAKE_INSTALL_LIBDIR} instead of hardcoded
+    lib. This allows differentiation between lib and lib64.
+    CMAKE_INSTALL_LIBDIR is used for installation of libraries and also for
+    pkgconfig file generation.
+
+  - Add the version of PCRE2 to the configuration summary like ./configure
+    does.
+
+  - Fix typo: MACTHED_STRING->MATCHED_STRING
+
+4. Updated CMakeLists.txt with another patch from Wolfgang Stöggl (Bugzilla
+#2588):
+
+  - Add escaped double quotes around include directory in CMakeLists.txt to
+    allow spaces in directory names.
+
+  - This fixes a cmake error, if the path of the pcre2 source contains a space.
+
+5. Updated CMakeLists.txt with a patch from B. Scott Michel: CMake's
+documentation suggests using CHECK_SYMBOL_EXISTS over CHECK_FUNCTION_EXIST.
+Moreover, these functions come from specific header files, which need to be
+specified (and, thankfully, are the same on both the Linux and WinXX
+platforms.)
+
+6. Added a (uint32_t) cast to prevent a compiler warning in pcre2_compile.c.
+
+7. Applied a patch from Wolfgang Stöggl (Bugzilla #2600) to fix postfix for
+debug Windows builds using CMake. This also updated configure so that it
+generates *.pc files and pcre2-config with the same content, as in the past.
+
+8. If a pattern ended with (?(VERSION=n.d where n is any number but d is just a
+single digit, the code unit beyond d was being read (i.e. there was a read
+buffer overflow). Fixes ClusterFuzz 23779.
+
+9. After the rework in r1235, certain character ranges were incorrectly
+handled by an optimization in JIT. Furthermore a wrong offset was used to
+read a value from a buffer which could lead to memory overread.
+
+10. Unnoticed for many years was the fact that delimiters other than / in the
+testinput1 and testinput4 files could cause incorrect behaviour when these
+files were processed by perltest.sh. There were several tests that used quotes
+as delimiters, and it was just luck that they didn't go wrong with perltest.sh.
+All the patterns in testinput1 and testinput4 now use / as their delimiter.
+This fixes Bugzilla #2641.
+
+11. Perl has started to give an error for \K within lookarounds (though there
+are cases where it doesn't). PCRE2 still allows this, so the tests that include
+this case have been moved from test 1 to test 2.
+
+12. Further to 10 above, pcre2test has been updated to detect and grumble if a
+delimiter other than / is used after #perltest.
+
+13. Fixed a bug with PCRE2_MATCH_INVALID_UTF in 8-bit mode when PCRE2_CASELESS
+was set and PCRE2_NO_START_OPTIMIZE was not set. The optimization for finding
+the start of a match was not resetting correctly after a failed match on the
+first valid fragment of the subject, possibly causing incorrect "no match"
+returns on subsequent fragments. For example, the pattern /A/ failed to match
+the subject \xe5A. Fixes Bugzilla #2642.
+
+14. Fixed a bug in character set matching when JIT is enabled and both unicode
+scripts and unicode classes are present at the same time.
+
+15. Added GNU grep's -m (aka --max-count) option to pcre2grep.
+
+16. Refactored substitution processing in pcre2grep strings, both for the -O
+option and when dealing with callouts. There is now a single function that
+handles $ expansion in all cases (instead of multiple copies of almost
+identical code). This means that the same escape sequences are available
+everywhere, which was not previously the case. At the same time, the escape
+sequences $x{...} and $o{...} have been introduced, to allow for characters
+whose code points are greater than 255 in Unicode mode.
+
+17. Applied the patch from Bugzilla #2628 to RunGrepTest. This does an explicit
+test for a version of sed that can handle binary zero, instead of assuming that
+any Linux version will work. Later: replaced $(...) by `...` because not all
+shells recognize the former.
+
+18. Fixed a word boundary check bug in JIT when partial matching is enabled.
+
+19. Fix ARM64 compilation warning in JIT. Patch by Carlo.
+
+20. A bug in the RunTest script meant that if the first part of test 2 failed,
+the failure was not reported.
+
+21. Test 2 was failing when run from a directory other than the source
+directory. This failure was previously missed in RunTest because of 20 above.
+Fixes added to both RunTest and RunTest.bat.
+
+22. Patch to CMakeLists.txt from Daniel to fix problem with testing under
+Windows.
+
+
+Version 10.35 09-May-2020
+---------------------------
+
+1. Use PCRE2_MATCH_EMPTY flag to detect empty matches in JIT.
+
+2. Fix ARMv5 JIT improper handling of labels right after a constant pool.
+
+3. A JIT bug is fixed which allowed to read the fields of the compiled
+pattern before its existence is checked.
+
+4. Back in the PCRE1 day, capturing groups that contained recursive back
+references to themselves were made atomic (version 8.01, change 18) because
+after the end a repeated group, the captured substrings had their values from
+the final repetition, not from an earlier repetition that might be the
+destination of a backtrack. This feature was documented, and was carried over
+into PCRE2. However, it has now been realized that the major refactoring that
+was done for 10.30 has made this atomicizing unnecessary, and it is confusing
+when users are unaware of it, making some patterns appear not to be working as
+expected. Capture values of recursive back references in repeated groups are
+now correctly backtracked, so this unnecessary restriction has been removed.
+
+5. Added PCRE2_SUBSTITUTE_LITERAL.
+
+6. Avoid some VS compiler warnings.
+
+7. Added PCRE2_SUBSTITUTE_MATCHED.
+
+8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
+regex engine. The Perl regex folks are aware of this usage and have made a note
+about it.
+
+9. When an assertion is repeated, PCRE2 used to limit the maximum repetition to
+1, believing that repeating an assertion is pointless. However, if a positive
+assertion contains capturing groups, repetition can be useful. In any case, an
+assertion could always be wrapped in a repeated group. The only restriction
+that is now imposed is that an unlimited maximum is changed to one more than
+the minimum.
+
+10. Fix *THEN verbs in lookahead assertions in JIT.
+
+11. Added PCRE2_SUBSTITUTE_REPLACEMENT_ONLY.
+
+12. The JIT stack should be freed when the low-level stack allocation fails.
+
+13. In pcre2grep, if the final line in a scanned file is output but does not
+end with a newline sequence, add a newline according to the --newline setting.
+
+14. (?(DEFINE)...) groups were not being handled correctly when checking for
+the fixed length of a lookbehind assertion. Such a group within a lookbehind
+should be skipped, as it does not contribute to the length of the group.
+Instead, the (DEFINE) group was being processed, and if at the end of the
+lookbehind, that end was not correctly recognized. Errors such as "lookbehind
+assertion is not fixed length" and also "internal error: bad code value in
+parsed_skip()" could result.
+
+15. Put a limit of 1000 on recursive calls in pcre2_study() when searching
+nested groups for starting code units, in order to avoid stack overflow issues.
+If the limit is reached, it just gives up trying for this optimization.
+
+16. The control verb chain list must always be restored when exiting from a
+recurse function in JIT.
+
+17. Fix a crash which occurs when the character type of an invalid UTF
+character is decoded in JIT.
+
+18. Changes in many areas of the code so that when Unicode is supported and
+PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
+upper/lower case computations on characters whose code points are greater than
+127.
+
+19. The function for checking UTF-16 validity was returning an incorrect offset
+for the start of the error when a high surrogate was not followed by a valid
+low surrogate. This caused incorrect behaviour, for example when
+PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the
+invalid high surrogate, such as /aa/ matching "\x{d800}aa".
+
+20. If a DEFINE group immediately preceded a lookbehind assertion, the pattern
+could be mis-compiled and therefore not match correctly. This is the example
+that found this: /(?(DEFINE)(?<foo>bar))(?<![-a-z0-9])word/ which failed to
+match "word" because the "move back" value was set to zero.
+
+21. Following a request from a user, some extensions and tidies to the
+character tables handling have been done:
+
+  (a) The dftables auxiliary program is renamed pcre2_dftables, but it is still
+  not installed for public use.
+
+  (b) There is now a -b option for pcre2_dftables, which causes the tables to
+  be written in binary. There is also a -help option.
+
+  (c) PCRE2_CONFIG_TABLES_LENGTH is added to pcre2_config() so that an
+  application that wants to save tables in binary knows how long they are.
+
+22. Changed setting of CMAKE_MODULE_PATH in CMakeLists.txt from SET to
+LIST(APPEND...) to allow a setting from the command line to be included.
+
+23. Updated to Unicode 13.0.0.
+
+24. CMake build now checks for secure_getenv() and strerror(). Patch by Carlo.
+
+25. Avoid using [-1] as a suffix in pcre2test because it can provoke a compiler
+warning.
+
+26. Added tests for __attribute__((uninitialized)) to both the configure and
+CMake build files, and then applied this attribute to the variable called
+stack_frames_vector[] in pcre2_match(). When implemented, this disables
+automatic initialization (a facility in clang), which can take time on big
+variables.
+
+27. Updated CMakeLists.txt (patches by Uwe Korn) to add support for
+pcre2-config, the libpcre*.pc files, SOVERSION, VERSION and the
+MACHO_*_VERSIONS settings for CMake builds.
+
+28. Another patch to CMakeLists.txt to check for mkostemp (configure already
+does). Patch by Carlo Marcelo Arenas Belon.
+
+29. Check for the existence of memfd_create in both CMake and configure
+configurations. Patch by Carlo Marcelo Arenas Belon.
+
+30. Restrict the configuration setting for the SELinux compatible execmem
+allocator (change 10.30/44) to Linux and NetBSD.
+
+
+Version 10.34 21-November-2019
+------------------------------
+
+1. The maximum number of capturing subpatterns is 65535 (documented), but no
+check on this was ever implemented. This omission has been rectified; it fixes
+ClusterFuzz 14376.
+
+2. Improved the invalid utf32 support of the JIT compiler. Now it correctly
+detects invalid characters in the 0xd800-0xdfff range.
+
+3. Fix minor typo bug in JIT compile when \X is used in a non-UTF string.
+
+4. Add support for matching in invalid UTF strings to the pcre2_match()
+interpreter, and integrate with the existing JIT support via the new
+PCRE2_MATCH_INVALID_UTF compile-time option.
+
+5. Give more error detail for invalid UTF-8 when detected in pcre2grep.
+
+6. Add support for invalid UTF-8 to pcre2grep.
+
+7. Adjust the limit for "must have" code unit searching, in particular,
+increase it substantially for non-anchored patterns.
+
+8. Allow (*ACCEPT) to be quantified, because an ungreedy quantifier with a zero
+minimum is potentially useful.
+
+9. Some changes to the way the minimum subject length is handled:
+
+   * When PCRE2_NO_START_OPTIMIZE is set, no minimum length is computed;
+     pcre2test now omits this item instead of showing a value of zero.
+
+   * An incorrect minimum length could be calculated for a pattern that
+     contained (*ACCEPT) inside a qualified group whose minimum repetition was
+     zero, for example /A(?:(*ACCEPT))?B/, which incorrectly computed a minimum
+     of 2. The minimum length scan no longer happens for a pattern that
+     contains (*ACCEPT).
+
+   * When no minimum length is set by the normal scan, but a first and/or last
+     code unit is recorded, set the minimum to 1 or 2 as appropriate.
+
+   * When a pattern contains multiple groups with the same number, a back
+     reference cannot know which one to scan for a minimum length. This used to
+     cause the minimum length finder to give up with no result. Now it treats
+     such references as not adding to the minimum length (which it should have
+     done all along).
+
+   * Furthermore, the above action now happens only if the back reference is to
+     a group that exists more than once in a pattern instead of any back
+     reference in a pattern with duplicate numbers.
+
+10. A (*MARK) value inside a successful condition was not being returned by the
+interpretive matcher (it was returned by JIT). This bug has been mended.
+
+11. A bug in pcre2grep meant that -o without an argument (or -o0) didn't work
+if the pattern had more than 32 capturing parentheses. This is fixed. In
+addition (a) the default limit for groups requested by -o<n> has been raised to
+50, (b) the new --om-capture option changes the limit, (c) an error is raised
+if -o asks for a group that is above the limit.
+
+12. The quantifier {1} was always being ignored, but this is incorrect when it
+is made possessive and applied to an item in parentheses, because a
+parenthesized item may contain multiple branches or other backtracking points,
+for example /(a|ab){1}+c/ or /(a+){1}+a/.
+
+13. For partial matches, pcre2test was always showing the maximum lookbehind
+characters, flagged with "<", which is misleading when the lookbehind didn't
+actually look behind the start (because it was later in the pattern). Showing
+all consulted preceding characters for partial matches is now controlled by the
+existing "allusedtext" modifier and, as for complete matches, this facility is
+available only for non-JIT matching, because JIT does not maintain the first
+and last consulted characters.
+
+14. DFA matching (using pcre2_dfa_match()) was not recognising a partial match
+if the end of the subject was encountered in a lookahead (conditional or
+otherwise), an atomic group, or a recursion.
+
+15. Give error if pcre2test -t, -T, -tm or -TM is given an argument of zero.
+
+16. Check for integer overflow when computing lookbehind lengths. Fixes
+Clusterfuzz issue 15636.
+
+17. Implemented non-atomic positive lookaround assertions.
+
+18. If a lookbehind contained a lookahead that contained another lookbehind
+within it, the nested lookbehind was not correctly processed. For example, if
+/(?<=(?=(?<=a)))b/ was matched to "ab" it gave no match instead of matching
+"b".
+
+19. Implemented pcre2_get_match_data_size().
+
+20. Two alterations to partial matching:
+
+    (a) The definition of a partial match is slightly changed: if a pattern
+    contains any lookbehinds, an empty partial match may be given, because this
+    is another situation where adding characters to the current subject can
+    lead to a full match. Example: /c*+(?<=[bc])/ with subject "ab".
+
+    (b) Similarly, if a pattern could match an empty string, an empty partial
+    match may be given. Example: /(?![ab]).*/ with subject "ab". This case
+    applies only to PCRE2_PARTIAL_HARD.
+
+    (c) An empty string partial hard match can be returned for \z and \Z as it
+    is documented that they shouldn't match.
+
+21. A branch that started with (*ACCEPT) was not being recognized as one that
+could match an empty string.
+
+22. Corrected pcre2_set_character_tables() tables data type: was const unsigned
+char * instead of const uint8_t *, as generated by pcre2_maketables().
+
+23. Upgraded to Unicode 12.1.0.
+
+24. Add -jitfast command line option to pcre2test (to make all the jit options
+available directly).
+
+25. Make pcre2test -C show if libreadline or libedit is supported.
+
+26. If the length of one branch of a group exceeded 65535 (the maximum value
+that is remembered as a minimum length), the whole group's length was
+incorrectly recorded as 65535, leading to incorrect "no match" when start-up
+optimizations were in force.
+
+27. The "rightmost consulted character" value was not always correct; in
+particular, if a pattern ended with a negative lookahead, characters that were
+inspected in that lookahead were not included.
+
+28. Add the pcre2_maketables_free() function.
+
+29. The start-up optimization that looks for a unique initial matching
+code unit in the interpretive engines uses memchr() in 8-bit mode. When the
+search is caseless, it was doing so inefficiently, which ended up slowing down
+the match drastically when the subject was very long. The revised code (a)
+remembers if one case is not found, so it never repeats the search for that
+case after a bumpalong and (b) when one case has been found, it searches only
+up to that position for an earlier occurrence of the other case. This fix
+applies to both interpretive pcre2_match() and to pcre2_dfa_match().
+
+30. While scanning to find the minimum length of a group, if any branch has
+minimum length zero, there is no need to scan any subsequent branches (a small
+compile-time performance improvement).
+
+31. Installed a .gitignore file on a user's suggestion. When using the svn
+repository with git (through git svn) this helps keep it tidy.
+
+32. Add underflow check in JIT which may occur when the value of subject
+string pointer is close to 0.
+
+33. Arrange for classes such as [Aa] which contain just the two cases of the
+same character, to be treated as a single caseless character. This causes the
+first and required code unit optimizations to kick in where relevant.
+
+34. Improve the bitmap of starting bytes for positive classes that include wide
+characters, but no property types, in UTF-8 mode. Previously, on encountering
+such a class, the bits for all bytes greater than \xc4 were set, thus
+specifying any character with codepoint >= 0x100. Now the only bits that are
+set are for the relevant bytes that start the wide characters. This can give a
+noticeable performance improvement.
+
+35. If the bitmap of starting code units contains only 1 or 2 bits, replace it
+with a single starting code unit (1 bit) or a caseless single starting code
+unit if the two relevant characters are case-partners. This is particularly
+relevant to the 8-bit library, though it applies to all. It can give a
+performance boost for patterns such as [Ww]ord and (word|WORD). However, this
+optimization doesn't happen if there is a "required" code unit of the same
+value (because the search for a "required" code unit starts at the match start
+for non-unique first code unit patterns, but after a unique first code unit,
+and patterns such as a*a need the former action).
+
+36. Small patch to pcre2posix.c to set the erroroffset field to -1 immediately
+after a successful compile, instead of at the start of matching to avoid a
+sanitizer complaint (regexec is supposed to be thread safe).
+
+37. Add NEON vectorization to JIT to speed up matching of first character and
+pairs of characters on ARM64 CPUs.
+
+38. If a non-ASCII character was the first in a starting assertion in a
+caseless match, the "first code unit" optimization did not get the casing
+right, and the assertion failed to match a character in the other case if it
+did not start with the same code unit.
+
+39. Fixed the incorrect computation of jump sizes on x86 CPUs in JIT. A masking
+operation was incorrectly removed in r1136. Reported by Ralf Junker.


 Version 10.33 16-April-2019
@ -153,7 +894,7 @@ Patch by Guillem Jover.
 warnings were reported.

 38. Using the clang compiler with sanitizing options causes runtime complaints
-about truncation for statments such as x = ~x when x is an 8-bit value; it
+about truncation for statements such as x = ~x when x is an 8-bit value; it
 seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
 gets rid of the warnings. There were also two missing casts in pcre2test.

--- a/17
+++ b/17
@ -16,6 +16,7 @@ while (scalar(@ARGV) > 0)
  
  while (<IN>)
    {  
+    $count = 0;
    $line++; 
    if (/^\s*$/)
      {
@ -50,13 +51,23 @@ while (scalar(@ARGV) > 0)
        $yield = 1;
        }
      }
-    else
-      {
-      if (/\\[^ef]|\\f[^IBP]/)
+    elsif (/\\[^ef]|\\f[^IBP]/)
      {
      printf "Bad backslash in line $line of $file\n";
      $yield = 1;
      }
+    while (/\\f[BI]/g)
+      {
+      $count++;
+      }
+    while (/\\fP/g)
+      {
+      $count--;
+      }
+    if ($count != 0)
+      {
+      printf "Mismatching formatting in line $line of $file\n";
+      $yield = 1;
      }
    }
     
--- a/90
+++ b/90
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
 the pcre2test documentation and the comment at the head of the RunTest file.

 PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
-releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
-confusion with PCRE1.
+releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
+releases started at 10.00 to avoid confusion with PCRE1.


 Historical note 1
@ -38,8 +38,8 @@ Historical note 2
 By contrast, the code originally written by Henry Spencer (which was
 subsequently heavily modified for Perl) compiles the expression twice: once in
 a dummy mode in order to find out how much store will be needed, and then for
-real. (The Perl version probably doesn't do this any more; I'm talking about
-the original library.) The execution function operates by backtracking and
+real. (The Perl version may or may not still do this; I'm talking about the
+original library.) The execution function operates by backtracking and
 maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
 matches individual wild portions of the pattern. This is an "NFA algorithm" in
 Friedl's terminology.
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
 advance to check for such values. When auto-callouts are enabled, the generous
 assumption is made that there will be a callout for each pattern code unit
 (which of course is only actually true if all code units are literals) plus one
-at the end. There is a default parsed pattern vector on the system stack, but
-if this is not big enough, heap memory is used.
+at the end. A default parsed pattern vector is defined on the system stack, to
+minimize memory handling, but if this is not big enough, heap memory is used.

 As before, the actual compiling function is run twice, the first time to
 determine the amount of memory needed for the final compiled pattern. It
@ -187,7 +187,7 @@ META_CLASS_EMPTY      [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
 META_CLASS_EMPTY_NOT  [^] negative empty class - ditto
 META_CLASS_END        ] end of non-empty class
 META_CLASS_NOT        [^ start non-empty negative class
-META_COMMIT           (*COMMIT)
+META_COMMIT           (*COMMIT) - no argument (see below for with argument)
 META_COND_ASSERT      (?(?assertion)
 META_DOLLAR           $ metacharacter
 META_DOT              . metacharacter
@ -195,23 +195,24 @@ META_END              End of pattern (this value is 0x80000000)
 META_FAIL             (*FAIL)
 META_KET              ) closing parenthesis
 META_LOOKAHEAD        (?= start of lookahead
+META_LOOKAHEAD_NA     (*napla: start of non-atomic lookahead
 META_LOOKAHEADNOT     (?! start of negative lookahead
 META_NOCAPTURE        (?: no capture parens
 META_PLUS             +
 META_PLUS_PLUS        ++
 META_PLUS_QUERY       +?
-META_PRUNE            (*PRUNE) - no argument
+META_PRUNE            (*PRUNE) - no argument (see below for with argument)
 META_QUERY            ?
 META_QUERY_PLUS       ?+
 META_QUERY_QUERY      ??
 META_RANGE_ESCAPED    hyphen in class range with at least one escape
 META_RANGE_LITERAL    hyphen in class range defined literally
-META_SKIP             (*SKIP) - no argument
-META_THEN             (*THEN) - no argument
+META_SKIP             (*SKIP) - no argument (see below for with argument)
+META_THEN             (*THEN) - no argument (see below for with argument)

 The two RANGE values occur only in character classes. They are positioned
 between two literals that define the start and end of the range. In an EBCDIC
-evironment it is necessary to know whether either of the range values was
+environment it is necessary to know whether either of the range values was
 specified as an escape. In an ASCII/Unicode environment the distinction is not
 relevant.

@ -228,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
 is the length of its branch, for which OP_REVERSE must be generated.

 META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
-their data in the lower 16 bits of the element.
+their data in the lower 16 bits of the element. META_RECURSE is followed by an
+offset, for use in error messages.

 META_BACKREF is followed by an offset if the back reference group number is 10
-or more. The offsets of the first ocurrences of references to groups whose
+or more. The offsets of the first occurrences of references to groups whose
 numbers are less than 10 are put in cb->small_ref_offset[] (only the first
 occurrence is useful). On 64-bit systems this avoids using more than two parsed
 pattern elements for items such as \3. The offset is used when an error occurs
 because the reference is to a non-existent group.

-META_RECURSE is always followed by an offset, for use in error messages.
-
 META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
 element contains the 16-bit type and data property values, packed together.
 ESC_g and ESC_k are used only for named references - numerical ones are turned
@ -286,12 +286,13 @@ The following are also followed just by an offset, but also the lower 16 bits
 of the main word contain the length of the first branch of the lookbehind
 group; this is used when generating OP_REVERSE for that branch.

-META_LOOKBEHIND       (?<=
-META_LOOKBEHINDNOT    (?<!
+META_LOOKBEHIND       (?<=      start of lookbehind
+META_LOOKBEHIND_NA    (*naplb:  start of non-atomic lookbehind
+META_LOOKBEHINDNOT    (?<!      start of negative lookbehind

-The following are followed by two elements, the minimum and maximum. Repeat
-values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
-represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
+The following are followed by two elements, the minimum and maximum. The
+maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
+is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:

 META_MINMAX           {n,m}  repeat
 META_MINMAX_PLUS      {n,m}+ repeat
@ -345,11 +346,11 @@ support is not available for this kind of matching.
 Changeable options
 ------------------

-The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
-others) may be changed in the middle of patterns by items such as (?i). Their
-processing is handled entirely at compile time by generating different opcodes
-for the different settings. The runtime functions do not need to keep track of
-an option's state.
+The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
+some others may be changed in the middle of patterns by items such as (?i).
+Their processing is handled entirely at compile time by generating different
+opcodes for the different settings. The runtime functions do not need to keep
+track of an option's state.

 PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
 are tracked and processed during the parsing pre-pass. The others are handled
@ -466,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
 case-equivalent code points (which is possible only in UTF mode) is handled by
 compiling a Unicode property item (see below), with the pseudo-property
 PT_CLIST. The value of this property is an offset in a vector called
-"ucd_caseless_sets" which identifies the start of a short list of equivalent
-characters, terminated by the value NOTACHAR (0xffffffff).
+"ucd_caseless_sets" which identifies the start of a short list of case
+equivalent characters, terminated by the value NOTACHAR (0xffffffff).


 Repeating single characters
@ -544,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
 and a value. The types are a set of #defines of the form PT_xxx, and the values
 are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
 The value is relevant only for PT_GC (General Category), PT_PC (Particular
-Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
-identify a list of case-equivalent characters when there are three or more.
+Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
+and the pseudo-property PT_CLIST, which is used to identify a list of
+case-equivalent characters when there are three or more (see above).

 Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
 three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
@ -665,7 +667,7 @@ used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
 OP_KETRMAX are used for indefinite repetitions, minimally or maximally
 respectively, and OP_KETRPOS for possessive repetitions (see below for more
 details). All four are followed by a LINK_SIZE value giving (as a positive
-number) the offset back to the matching bracket opcode.
+number) the offset back to the matching opening bracket opcode.

 If a subpattern is quantified such that it is permitted to match zero times, it
 is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
@ -715,13 +717,15 @@ Assertions
 ----------

 Forward assertions are also just like other subpatterns, but starting with one
-of the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
-OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
-is OP_REVERSE, followed by a count of the number of characters to move back the
-pointer in the subject string. In ASCII or UTF-32 mode, the count is also the
-number of code units, but in UTF-8/16 mode each character may occupy more than
-one code unit. A separate count is present in each alternative of a lookbehind
-assertion, allowing them to have different (but fixed) lengths.
+of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
+OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
+OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
+assertion is OP_REVERSE, followed by a count of the number of characters to
+move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
+is also the number of code units, but in UTF-8/16 mode each character may
+occupy more than one code unit. A separate count is present in each alternative
+of a lookbehind assertion, allowing each branch to have a different (but fixed)
+length.


 Conditional subpatterns
@ -754,11 +758,11 @@ tests the PCRE2 version number. This compiles into one of the opcodes OP_TRUE
 or OP_FALSE.

 If a condition is not a back reference, recursion test, DEFINE, or VERSION, it
-must start with a parenthesized assertion, whose opcode normally immediately
-follows OP_COND or OP_SCOND. However, if automatic callouts are enabled, a
-callout is inserted immediately before the assertion. It is also possible to
-insert a manual callout at this point. Only assertion conditions may have
-callouts preceding the condition.
+must start with a parenthesized atomic assertion, whose opcode normally
+immediately follows OP_COND or OP_SCOND. However, if automatic callouts are
+enabled, a callout is inserted immediately before the assertion. It is also
+possible to insert a manual callout at this point. Only assertion conditions
+may have callouts preceding the condition.

 A condition that is the negative assertion (?!) is optimized to OP_FAIL in all
 parts of the pattern, so this is another opcode that may appear as a condition.
@ -823,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
 opcode are the correct length, in order to catch updating errors.

 Philip Hazel
-20 July 2018
+April 2022
--- a/12
+++ b/12
@ -20,13 +20,13 @@ THE BASIC LIBRARY FUNCTIONS
 ---------------------------

 Written by:       Philip Hazel
-Email local part: ph10
-Email domain:     cam.ac.uk
+Email local part: Philip.Hazel
+Email domain:     gmail.com

-University of Cambridge Computing Service,
+Retired from University of Cambridge Computing Service,
 Cambridge, England.

-Copyright (c) 1997-2019 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
 All rights reserved.


@ -37,7 +37,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Email domain:     freemail.hu

-Copyright(c) 2010-2019 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
 All rights reserved.


@ -48,7 +48,7 @@ Written by:       Zoltan Herczeg
 Email local part: hzmester
 Email domain:     freemail.hu

-Copyright(c) 2009-2019 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
 All rights reserved.


--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -0,0 +1,8 @@
+module(
+    name = "pcre2",
+    version = "10.40",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "rules_cc", version = "0.0.1")
+bazel_dep(name = "bazel_skylib", version = "1.2.1")
--- a/Makefile.am
+++ b/Makefile.am
@ -46,6 +46,7 @@ dist_html_DATA = \
  doc/html/pcre2_general_context_free.html \
  doc/html/pcre2_get_error_message.html \
  doc/html/pcre2_get_mark.html \
+  doc/html/pcre2_get_match_data_size.html \
  doc/html/pcre2_get_ovector_count.html \
  doc/html/pcre2_get_ovector_pointer.html \
  doc/html/pcre2_get_startchar.html \
@ -56,6 +57,7 @@ dist_html_DATA = \
  doc/html/pcre2_jit_stack_create.html \
  doc/html/pcre2_jit_stack_free.html \
  doc/html/pcre2_maketables.html \
+  doc/html/pcre2_maketables_free.html \
  doc/html/pcre2_match.html \
  doc/html/pcre2_match_context_copy.html \
  doc/html/pcre2_match_context_create.html \
@ -140,6 +142,7 @@ dist_man_MANS = \
  doc/pcre2_general_context_free.3 \
  doc/pcre2_get_error_message.3 \
  doc/pcre2_get_mark.3 \
+  doc/pcre2_get_match_data_size.3 \
  doc/pcre2_get_ovector_count.3 \
  doc/pcre2_get_ovector_pointer.3 \
  doc/pcre2_get_startchar.3 \
@ -150,6 +153,7 @@ dist_man_MANS = \
  doc/pcre2_jit_stack_create.3 \
  doc/pcre2_jit_stack_free.3 \
  doc/pcre2_maketables.3 \
+  doc/pcre2_maketables_free.3 \
  doc/pcre2_match.3 \
  doc/pcre2_match_context_copy.3 \
  doc/pcre2_match_context_create.3 \
@ -321,18 +325,18 @@ include_HEADERS = src/pcre2posix.h
 bin_SCRIPTS = pcre2-config

 ## ---------------------------------------------------------------
-## The dftables program is used to rebuild character tables before compiling
-## PCRE2, if --enable-rebuild-chartables is specified. It is not a user-visible
-## program. The default (when --enable-rebuild-chartables is not specified) is
-## to copy a distributed set of tables that are defined for ASCII code. In this
-## case, dftables is not needed.
+## The pcre2_dftables program is used to rebuild character tables before
+## compiling PCRE2, if --enable-rebuild-chartables is specified. It is not an
+## installed program. The default (when --enable-rebuild-chartables is not
+## specified) is to copy a distributed set of tables that are defined for ASCII
+## code. In this case, pcre2_dftables is not needed.

 if WITH_REBUILD_CHARTABLES
-noinst_PROGRAMS += dftables
-dftables_SOURCES = src/dftables.c
-src/pcre2_chartables.c: dftables$(EXEEXT)
+noinst_PROGRAMS += pcre2_dftables
+pcre2_dftables_SOURCES = src/pcre2_dftables.c
+src/pcre2_chartables.c: pcre2_dftables$(EXEEXT)
 	rm -f $@
-	./dftables$(EXEEXT) $@
+	./pcre2_dftables$(EXEEXT) $@
 else
 src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.dist
 	rm -f $@
@ -358,6 +362,8 @@ COMMON_SOURCES = \
  src/pcre2_internal.h \
  src/pcre2_intmodedep.h \
  src/pcre2_jit_compile.c \
+  src/pcre2_jit_neon_inc.h \
+  src/pcre2_jit_simd_inc.h \
  src/pcre2_maketables.c \
  src/pcre2_match.c \
  src/pcre2_match_data.c \
@ -376,6 +382,10 @@ COMMON_SOURCES = \
  src/pcre2_valid_utf.c \
  src/pcre2_xclass.c

+# The pcre2_ucptables.c file is #included by pcre2_tables.c
+
+EXTRA_DIST += src/pcre2_ucptables.c
+
 if WITH_PCRE2_8
 lib_LTLIBRARIES += libpcre2-8.la
 libpcre2_8_la_SOURCES = \
@ -385,6 +395,7 @@ nodist_libpcre2_8_la_SOURCES = \
 libpcre2_8_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=8 \
  $(VISIBILITY_CFLAGS) \
+  $(CET_CFLAGS) \
  $(AM_CFLAGS)
 libpcre2_8_la_LIBADD =
 endif # WITH_PCRE2_8
@ -398,6 +409,7 @@ nodist_libpcre2_16_la_SOURCES = \
 libpcre2_16_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=16 \
  $(VISIBILITY_CFLAGS) \
+  $(CET_CFLAGS) \
  $(AM_CFLAGS)
 libpcre2_16_la_LIBADD =
 endif # WITH_PCRE2_16
@ -411,6 +423,7 @@ nodist_libpcre2_32_la_SOURCES = \
 libpcre2_32_la_CFLAGS = \
  -DPCRE2_CODE_UNIT_WIDTH=32 \
  $(VISIBILITY_CFLAGS) \
+  $(CET_CFLAGS) \
  $(AM_CFLAGS)
 libpcre2_32_la_LIBADD =
 endif # WITH_PCRE2_32
@ -439,15 +452,16 @@ EXTRA_DIST += \
  src/sljit/sljitNativePPC_32.c \
  src/sljit/sljitNativePPC_64.c \
  src/sljit/sljitNativePPC_common.c \
-  src/sljit/sljitNativeSPARC_32.c \
-  src/sljit/sljitNativeSPARC_common.c \
-  src/sljit/sljitNativeTILEGX-encoder.c \
-  src/sljit/sljitNativeTILEGX_64.c \
+  src/sljit/sljitNativeRISCV_32.c \
+  src/sljit/sljitNativeRISCV_64.c \
+  src/sljit/sljitNativeRISCV_common.c \
+  src/sljit/sljitNativeS390X.c \
  src/sljit/sljitNativeX86_32.c \
  src/sljit/sljitNativeX86_64.c \
  src/sljit/sljitNativeX86_common.c \
  src/sljit/sljitProtExecAllocator.c \
-  src/sljit/sljitUtils.c
+  src/sljit/sljitUtils.c \
+  src/sljit/sljitWXExecAllocator.c

 # Some of the JIT sources are also in separate files that are #included.

@ -628,6 +642,7 @@ EXTRA_DIST += \
  testdata/grepoutputCN \
  testdata/grepoutputN \
  testdata/greppatN4 \
+  testdata/testbtables \
  testdata/testinput1 \
  testdata/testinput2 \
  testdata/testinput3 \
@ -653,6 +668,7 @@ EXTRA_DIST += \
  testdata/testinput23 \
  testdata/testinput24 \
  testdata/testinput25 \
+  testdata/testinput26 \
  testdata/testinputEBC \
  testdata/testoutput1 \
  testdata/testoutput2 \
@ -695,6 +711,7 @@ EXTRA_DIST += \
  testdata/testoutput23 \
  testdata/testoutput24 \
  testdata/testoutput25 \
+  testdata/testoutput26 \
  testdata/testoutputEBC \
  testdata/valgrind-jit.supp \
  testdata/wintestinput3 \
@ -849,9 +866,11 @@ endif # WITH_GCOV

 EXTRA_DIST += \
  cmake/COPYING-CMAKE-SCRIPTS \
+  cmake/FindEditline.cmake \
  cmake/FindPackageHandleStandardArgs.cmake \
  cmake/FindReadline.cmake \
-  cmake/FindEditline.cmake \
+  cmake/pcre2-config-version.cmake.in \
+  cmake/pcre2-config.cmake.in \
  CMakeLists.txt \
  config-cmake.h.in

--- a/Makefile.os4
+++ b/Makefile.os4
@ -0,0 +1,271 @@
+#
+# Project: pcre2
+#
+# Created on: 10-01-2022 22:01:46
+#
+# commands to use:
+# make -f Makefile.os4 libpcre2.a
+# make -f Makefile.os4 libpcre2-posix.a
+# make -f Makefile.os4 pcre2test
+# sh RunTest
+# make -f Makefile.os4 clean
+#
+
+###################################################################
+##
+##////  Objects
+##
+###################################################################
+
+libpcre2_OBJ := \
+	 src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
+	 src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
+	 src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
+	 src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
+	 src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
+	 src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
+	 src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
+	 src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
+	 src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
+	
+
+
+pcre2posix_OBJ := \
+	 src/pcre2posix.o
+
+
+pcre2test_OBJ := \
+	 src/pcre2test.o
+
+
+pcre2grep_OBJ := \
+	 src/pcre2grep.o
+
+###################################################################
+##
+##////  Variables and Environment
+##
+###################################################################
+
+MCRT := -mcrt=newlib
+ifeq ($(USE_CLIB2), yes)
+MCRT := -mcrt=clib2
+endif
+
+CC := gcc:bin/gcc
+
+INCPATH := -I. -Isrc
+
+# for pcre2test
+CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
+
+###################################################################
+##
+##////  General rules
+##
+###################################################################
+
+.PHONY: all all-before all-after clean clean-custom realclean
+
+all: all-before libpcre2.a libpcre2-posix.a all-after
+
+all-before:
+#	You can add rules here to execute before the project is built
+
+all-after:
+#	You can add rules here to execute after the project is built
+
+tests: pcre2test pcre2grep
+
+clean: clean-custom
+	@echo "Cleaning compiler objects..."
+	@rm -f  $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ)
+
+cleanall: clean
+	@echo "Cleaning compiler targets..."
+	@rm -f  libpcre.a libpcre-posix.a pcre2test pcre2grep
+
+###################################################################
+##
+##////  Targets
+##
+###################################################################
+
+libpcre2.a: $(libpcre2_OBJ)
+	ar -rcs libpcre2.a $(libpcre2_OBJ)
+	ranlib libpcre2.a
+
+libpcre2-posix.a: $(pcre2posix_OBJ)
+	ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
+	ranlib libpcre2-posix.a
+
+pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
+	@echo "Linking pcre2test"
+	@gcc:bin/gcc $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2 -lpcre2-posix
+	@echo "Removing stale debug target: pcre2test"
+	@rm -f pcre2test.debug
+	
+pcre2grep: libpcre2.a $(pcre2grep_OBJ)
+	@echo "Linking pcre2grep"
+	@gcc:bin/gcc $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
+	@echo "Removing stale debug target: pcre2grep"
+	@rm -f pcre2grep.debug
+
+
+###################################################################
+##
+##////  Standard rules
+##
+###################################################################
+
+# A default rule to make all the objects listed below
+# because we are hiding compiler commands from the output
+
+.c.o:
+	@echo "Compiling $<"
+	@$(CC) -c $< -o $*.o $(CFLAGS)
+
+src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	
+
+src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	 src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
+	 src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
+	 src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
+	 src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
+
+src/pcre2_maketables.o: src/pcre2_maketables.c
+
+src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
+	 src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
+	 src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
+	 src/pcre2_ucd.c src/pcre2_printint.c
+
+src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
+	
+
+src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
+	
+
+src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
+	 src/pcre2.h src/pcre2_ucp.h
+
+
+src/pcre2grep.o: src/pcre2grep.c src/config.h
+
+###################################################################
+##
+##////  Custom rules
+##
+###################################################################
+
+runtests: libpcre2.a libpcre2-posix.a tests
+	sh RunTest
+	sh RunGrepTest
+
+release:
+	@echo "Create release folders..."
+	@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
+	
+	@echo "Building newlib based libraries..."
+	@make -f Makefile.os4 all
+	@cp libpcre2.a release/local/newlib/lib/
+	@cp libpcre2-posix.a release/local/newlib/lib/
+	
+	@echo "Clean build and libraries files..."
+	@make -f Makefile.os4 cleanall
+	
+	@echo "Building clib2 based libraries..."
+	@make -f Makefile.os4 all USE_CLIB2=yes
+	@cp libpcre2.a release/local/clib2/lib/
+	@cp libpcre2-posix.a release/local/clib2/lib/
+
+	@echo "Copy the necessary files..."
+	@cp src/pcre2.h release/local/common/include/
+	@cp src/pcre2posix.h release/local/common/include/
+	@cp COPYING release/local/Documentation/pcre2/
+	@cp HACKING release/local/Documentation/pcre2/
+	@cp LICENCE release/local/Documentation/pcre2/
+	@cp README release/local/Documentation/pcre2/
+	@cp README-OS4.md release/local/Documentation/pcre2/
+	
+	@echo "Clean build and libraries files..."
+	@make -f Makefile.os4 cleanall
+	
+	@echo "Creating the lha release file..."
+	@rm -f pcre2.lha
+	@lha -aeqr3 a pcre2.lha release/
+	
+	@rm -rf release
+
+###################################################################
+
--- a/121
+++ b/121
@ -2,8 +2,125 @@ News about PCRE2 releases
 -------------------------


-Version 10.33-RC1 16-April-2019
-------------------------------
+Version 10.40 15-April-2022
+---------------------------
+
+This is mostly a bug-fixing and code-tidying release. However, there are some
+extensions to Unicode property handling:
+
+* Added support for Bidi_Class and a number of binary Unicode properties,
+including Bidi_Control.
+
+* A number of changes to script matching for \p and \P:
+
+  (a) Script extensions for a character are now coded as a bitmap instead of
+      a list of script numbers, which should be faster and does not need a
+      loop.
+
+  (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
+      sc and scx).
+
+  (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
+      the same as \p{scx:scriptname} because this change happened in Perl at
+      release 5.26.
+
+  (d) The standard Unicode 4-letter abbreviations for script names are now
+      recognized.
+
+  (e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
+      hyphens, and underscores are ignored in property names, which are then
+      matched independent of case.
+
+As always, see ChangeLog for a list of all changes (also the Git log).
+
+
+Version 10.39 29-October-2021
+-----------------------------
+
+This release is happening soon after 10.38 because the bug fix is important.
+
+1. Fix incorrect detection of alternatives in first character search in JIT.
+
+2. Update to Unicode 14.0.0.
+
+3. Some code cleanups (see ChangeLog).
+
+
+Version 10.38 01-October-2021
+-----------------------------
+
+As well as some bug fixes and tidies (as always, see ChangeLog for details),
+the documentation is updated to list the new URLs, following the move of the
+source repository to GitHub and the mailing list to Google Groups.
+
+* The CMake build system can now build both static and shared libraries in one
+go.
+
+* Following Perl's lead, \K is now locked out in lookaround assertions by
+default, but an option is provided to re-enable the previous behaviour.
+
+
+Version 10.37 26-May-2021
+-------------------------
+
+A few more bug fixes and tidies. The only change of real note is the removal of
+the actual POSIX names regcomp etc. from the POSIX wrapper library because
+these have caused issues for some applications (see 10.33 #2 below).
+
+
+Version 10.36 04-December-2020
+------------------------------
+
+Again, mainly bug fixes and tidies. The only enhancements are the addition of
+GNU grep's -m (aka --max-count) option to pcre2grep, and also unifying the
+handling of substitution strings for both -O and callouts in pcre2grep, with
+the addition of $x{...} and $o{...} to allow for characters whose code points
+are greater than 255 in Unicode mode.
+
+NOTE: there is an outstanding issue with JIT support for MacOS on arm64
+hardware. For details, please see Bugzilla issue #2618.
+
+
+Version 10.35 15-April-2020
+---------------------------
+
+Bugfixes, tidies, and a few new enhancements.
+
+1. Capturing groups that contain recursive backreferences to themselves are no
+longer automatically atomic, because the restriction is no longer necessary
+as a result of the 10.30 restructuring.
+
+2. Several new options for pcre2_substitute().
+
+3. When Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode
+character properties are used for upper/lower case computations on characters
+whose code points are greater than 127.
+
+4. The character tables (for low-valued characters) can now more easily be
+saved and restored in binary.
+
+5. Updated to Unicode 13.0.0.
+
+
+Version 10.34 21-November-2019
+------------------------------
+
+Another release with a few enhancements as well as bugfixes and tidies. The
+main new features are:
+
+1. There is now some support for matching in invalid UTF strings.
+
+2. Non-atomic positive lookarounds are implemented in the pcre2_match()
+interpreter, but not in JIT.
+
+3. Added two new functions: pcre2_get_match_data_size() and
+pcre2_maketables_free().
+
+4. Upgraded to Unicode 12.1.0.
+
+
+Version 10.33 16-April-2019
+---------------------------

 Yet more bugfixes, tidies, and a few enhancements, summarized here (see
 ChangeLog for the full list):
--- a/57
+++ b/57
@ -40,7 +40,11 @@ GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY

 The following are generic instructions for building the PCRE2 C library "by
 hand". If you are going to use CMake, this section does not apply to you; you
-can skip ahead to the CMake section.
+can skip ahead to the CMake section. Note that the settings concerned with
+8-bit, 16-bit, and 32-bit code units relate to the type of data string that
+PCRE2 processes. They are NOT referring to the underlying operating system bit
+width. You do not have to do anything special to compile in a 64-bit
+environment, for example.

 (1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
     macro settings that it contains to whatever is appropriate for your
@ -74,23 +78,23 @@ can skip ahead to the CMake section.
       src/pcre2_chartables.c.

     OR:
-       Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
-       if you have set up src/config.h), and then run it with the single
-       argument "src/pcre2_chartables.c". This generates a set of standard
-       character tables and writes them to that file. The tables are generated
-       using the default C locale for your system. If you want to use a locale
-       that is specified by LC_xxx environment variables, add the -L option to
-       the dftables command. You must use this method if you are building on a
-       system that uses EBCDIC code.
+       Compile src/pcre2_dftables.c as a stand-alone program (using
+       -DHAVE_CONFIG_H if you have set up src/config.h), and then run it with
+       the single argument "src/pcre2_chartables.c". This generates a set of
+       standard character tables and writes them to that file. The tables are
+       generated using the default C locale for your system. If you want to use
+       a locale that is specified by LC_xxx environment variables, add the -L
+       option to the pcre2_dftables command. You must use this method if you
+       are building on a system that uses EBCDIC code.

     The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
     specify alternative tables at run time.

- (4) For an 8-bit library, compile the following source files from the src
-     directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also
-     set -DHAVE_CONFIG_H if you have set up src/config.h with your
-     configuration, or else use other -D settings to change the configuration
-     as required.
+ (4) For a library that supports 8-bit code units in the character strings that
+     it processes, compile the following source files from the src directory,
+     setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set
+     -DHAVE_CONFIG_H if you have set up src/config.h with your configuration,
+     or else use other -D settings to change the configuration as required.

       pcre2_auto_possess.c
       pcre2_chartables.c
@ -117,6 +121,7 @@ can skip ahead to the CMake section.
       pcre2_substring.c
       pcre2_tables.c
       pcre2_ucd.c
+       pcre2_ucptables.c
       pcre2_valid_utf.c
       pcre2_xclass.c

@ -142,9 +147,9 @@ can skip ahead to the CMake section.
     If your system has static and shared libraries, you may have to do this
     once for each type.

- (6) If you want to build a 16-bit library or 32-bit library (as well as, or
-     instead of the 8-bit library) just supply 16 or 32 as the value of
-     -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
+ (6) If you want to build a library that supports 16-bit or 32-bit code units,
+     (as well as, or instead of the 8-bit library) just supply 16 or 32 as the
+     value of -DPCRE2_CODE_UNIT_WIDTH when you are compiling.

 (7) If you want to build the POSIX wrapper functions (which apply only to the
     8-bit library), ensure that you have the src/pcre2posix.h file and then
@ -302,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
 3.  Create a new, empty build directory, preferably a subdirectory of the
    source dir. For example, C:\pcre2\pcre2-xx\build.

-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
    Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
    to start Cmake from the Windows Start menu, as this can lead to errors.

@ -339,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".

 BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO

-The code currently cannot be compiled without a stdint.h header, which is
-available only in relatively recent versions of Visual Studio. However, this
-portable and permissively-licensed implementation of the header worked without
-issue:
+The code currently cannot be compiled without an inttypes.h header, which is
+available only with Visual Studio 2013 or newer. However, this portable and
+permissively-licensed implementation of the stdint.h header could be used as an
+alternative:

  http://www.azillionmonkeys.com/qed/pstdint.h

@ -369,7 +374,7 @@ Otherwise:
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
   have been created.

-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
   the pcre2 source (wherein which the testdata folder resides), e.g.:

   set srcdir=C:\pcre2\pcre2-10.00
@ -401,6 +406,6 @@ Everything in that location, source and executable, is in EBCDIC and native
 z/OS file formats. The port provides an API for LE languages such as COBOL and
 for the z/OS and z/VM versions of the Rexx languages.

-==============================
-Last Updated: 14 November 2018
-==============================
+===========================
+Last Updated: 28 April 2021
+===========================
--- a/2
+++ b/2
@ -190,7 +190,7 @@ files="\
  libpcre2-16.pc.in \
  libpcre2-32.pc.in \
  libpcre2-posix.pc.in \
-  src/dftables.c \
+  src/pcre2_dftables.c \
  src/pcre2.h.in \
  src/pcre2_auto_possess.c \
  src/pcre2_compile.c \
--- a/160
+++ b/160
@ -4,18 +4,20 @@ README file for PCRE2 (Perl-compatible regular expression library)
 PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
 API. Since its initial release in 2015, there has been further development of
 the code and it now differs from PCRE1 in more than just the API. There are new
-features and the internals have been improved. The latest release of PCRE2 is
-always available in three alternative formats from:
+features, and the internals have been improved. The original PCRE1 library is
+now obsolete and no longer maintained. The latest release of PCRE2 is available
+in .tar.gz, tar.bz2, or .zip form from this GitHub repository:

-  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
-  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
-  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
+https://github.com/PCRE2Project/pcre2/releases

-There is a mailing list for discussion about the development of PCRE (both the
-original and new APIs) at pcre-dev@exim.org. You can access the archives and
-subscribe or manage your subscription here:
+There is a mailing list for discussion about the development of PCRE2 at
+pcre2-dev@googlegroups.com. You can subscribe by sending an email to
+pcre2-dev+subscribe@googlegroups.com.

-   https://lists.exim.org/mailman/listinfo/pcre-dev
+You can access the archives and also subscribe or manage your subscription
+here:
+
+https://groups.google.com/g/pcre2-dev

 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@ -112,12 +114,18 @@ Building PCRE2 using autotools
 The following instructions assume the use of the widely used "configure; make;
 make install" (autotools) process.

-To build PCRE2 on system that supports autotools, first run the "configure"
-command from the PCRE2 distribution directory, with your current directory set
+If you have downloaded and unpacked a PCRE2 release tarball, run the
+"configure" command from the PCRE2 directory, with your current directory set
 to the directory where you want the files to be created. This command is a
 standard GNU "autoconf" configuration script, for which generic instructions
 are supplied in the file INSTALL.

+The files in the GitHub repository do not contain "configure". If you have
+downloaded the PCRE2 source files from GitHub, before you can run "configure"
+you must run the shell script called autogen.sh. This runs a number of
+autotools to create a "configure" script (you must of course have the autotools
+commands installed in order to do this).
+
 Most commonly, people build PCRE2 within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient. However,
 the usual methods of changing standard defaults are available. For example:
@ -164,9 +172,11 @@ library. They are also documented in the pcre2build man page.
  will be a compile time error. If in doubt, use --enable-jit=auto, which
  enables JIT only if the current hardware is supported.

-. If you are enabling JIT under SELinux you may also want to add
-  --enable-jit-sealloc, which enables the use of an execmem allocator in JIT
-  that is compatible with SELinux. This has no effect if JIT is not enabled.
+. If you are enabling JIT under SELinux environment you may also want to add
+  --enable-jit-sealloc, which enables the use of an executable memory allocator
+  that is compatible with SELinux. Warning: this allocator is experimental!
+  It does not support fork() operation and may crash when no disk space is
+  available. This option has no effect if JIT is disabled.

 . If you do not want to make use of the default support for UTF-8 Unicode
  character strings in the 8-bit library, UTF-16 Unicode character strings in
@ -184,10 +194,10 @@ library. They are also documented in the pcre2build man page.

  As well as supporting UTF strings, Unicode support includes support for the
  \P, \p, and \X sequences that recognize Unicode character properties.
-  However, only the basic two-letter properties such as Lu are supported.
-  Escape sequences such as \d and \w in patterns do not by default make use of
-  Unicode properties, but can be made to do so by setting the PCRE2_UCP option
-  or starting a pattern with (*UCP).
+  However, only a subset of Unicode properties are supported; see the
+  pcre2pattern man page for details. Escape sequences such as \d and \w in
+  patterns do not by default make use of Unicode properties, but can be made to
+  do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).

 . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
  of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
@ -267,9 +277,9 @@ library. They are also documented in the pcre2build man page.

  --enable-rebuild-chartables

-  a program called dftables is compiled and run in the default C locale when
-  you obey "make". It builds a source file called pcre2_chartables.c. If you do
-  not specify this option, pcre2_chartables.c is created as a copy of
+  a program called pcre2_dftables is compiled and run in the default C locale
+  when you obey "make". It builds a source file called pcre2_chartables.c. If
+  you do not specify this option, pcre2_chartables.c is created as a copy of
  pcre2_chartables.c.dist. See "Character tables" below for further
  information.

@ -295,8 +305,8 @@ library. They are also documented in the pcre2build man page.
  unaddressable. This allows it to detect invalid memory accesses, and is
  mostly useful for debugging PCRE2 itself.

-. In environments where the gcc compiler is used and lcov version 1.6 or above
-  is installed, if you specify
+. In environments where the gcc compiler is used and lcov is installed, if you
+  specify

  --enable-coverage

@ -365,19 +375,20 @@ library. They are also documented in the pcre2build man page.
  necessary to specify something like LIBS="-lncurses" as well. This is
  because, to quote the readline INSTALL, "Readline uses the termcap functions,
  but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
+  applications which link with readline the option to choose an appropriate
+  library."
  If you get error messages about missing functions tgetstr, tgetent, tputs,
  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
  should fix it.

 . The C99 standard defines formatting modifiers z and t for size_t and
  ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-  environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-  defined and has a value greater than or equal to 199901L (indicating C99).
-  However, there is at least one environment that claims to be C99 but does not
-  support these modifiers. If --disable-percent-zt is specified, no use is made
-  of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
-  size_t values.
+  environments other than Microsoft Visual Studio versions earlier than 2013
+  when __STDC_VERSION__ is defined and has a value greater than or equal to
+  199901L (indicating C99). However, there is at least one environment that
+  claims to be C99 but does not support these modifiers. If
+  --disable-percent-zt is specified, no use is made of the z or t modifiers.
+  Instead of %td or %zu, %lu is used, with a cast for size_t values.

 . There is a special option called --enable-fuzz-support for use by people who
  want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
@ -390,10 +401,10 @@ library. They are also documented in the pcre2build man page.
  Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
  be created. This is normally run under valgrind or used when PCRE2 is
  compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.

 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
  which caused pcre2_match() to use individual blocks on the heap for
@ -546,11 +557,11 @@ Cross-compiling using autotools

 You can specify CC and CFLAGS in the normal way to the "configure" command, in
 order to cross-compile PCRE2 for some other host. However, you should NOT
-specify --enable-rebuild-chartables, because if you do, the dftables.c source
-file is compiled and run on the local host, in order to generate the inbuilt
-character tables (the pcre2_chartables.c file). This will probably not work,
-because dftables.c needs to be compiled with the local compiler, not the cross
-compiler.
+specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c
+source file is compiled and run on the local host, in order to generate the
+inbuilt character tables (the pcre2_chartables.c file). This will probably not
+work, because pcre2_dftables.c needs to be compiled with the local compiler,
+not the cross compiler.

 When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
 created by making a copy of pcre2_chartables.c.dist, which is a default set of
@ -558,9 +569,10 @@ tables that assumes ASCII code. Cross-compiling with the default tables should
 not be a problem.

 If you need to modify the character tables when cross-compiling, you should
-move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
-and run it on the local host to make a new version of pcre2_chartables.c.dist.
-Then when you cross-compile PCRE2 this new version of the tables will be used.
+move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by
+hand and run it on the local host to make a new version of
+pcre2_chartables.c.dist. See the pcre2build section "Creating character tables
+at build time" for more details.


 Making new tarballs
@ -597,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.

 Many (but not all) of the tests that are not skipped are run twice if JIT
 support is available. On the second run, JIT compilation is forced. This
-testing can be suppressed by putting "nojit" on the RunTest command line.
+testing can be suppressed by putting "-nojit" on the RunTest command line.

 The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
 libraries that are enabled. If you want to run just one set of tests, call
 RunTest with either the -8, -16 or -32 option.

-If valgrind is installed, you can run the tests under it by putting "valgrind"
+If valgrind is installed, you can run the tests under it by putting "-valgrind"
 on the RunTest command line. To run pcre2test on just one or more specific test
 files, give their numbers as arguments to RunTest, for example:

@ -684,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
 different code unit widths.

 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.

 Test 16 is run only when JIT support is not available. It checks that an
@ -719,8 +731,8 @@ compile context.
 The source file called pcre2_chartables.c contains the default set of tables.
 By default, this is created as a copy of pcre2_chartables.c.dist, which
 contains tables for ASCII coding. However, if --enable-rebuild-chartables is
-specified for ./configure, a different version of pcre2_chartables.c is built
-by the program dftables (compiled from dftables.c), which uses the ANSI C
+specified for ./configure, a new version of pcre2_chartables.c is built by the
+program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C
 character handling functions such as isalnum(), isalpha(), isupper(),
 islower(), etc. to build the table sources. This means that the default C
 locale that is set for your system will control the contents of these default
@ -730,32 +742,40 @@ file does not get automatically re-generated. The best way to do this is to
 move pcre2_chartables.c.dist out of the way and replace it with your customized
 tables.

-When the dftables program is run as a result of --enable-rebuild-chartables,
-it uses the default C locale that is set on your system. It does not pay
-attention to the LC_xxx environment variables. In other words, it uses the
-system's default locale rather than whatever the compiling user happens to have
-set. If you really do want to build a source set of character tables in a
-locale that is specified by the LC_xxx variables, you can run the dftables
-program by hand with the -L option. For example:
+When the pcre2_dftables program is run as a result of specifying
+--enable-rebuild-chartables, it uses the default C locale that is set on your
+system. It does not pay attention to the LC_xxx environment variables. In other
+words, it uses the system's default locale rather than whatever the compiling
+user happens to have set. If you really do want to build a source set of
+character tables in a locale that is specified by the LC_xxx variables, you can
+run the pcre2_dftables program by hand with the -L option. For example:

-  ./dftables -L pcre2_chartables.c.special
+  ./pcre2_dftables -L pcre2_chartables.c.special

-The first two 256-byte tables provide lower casing and case flipping functions,
-respectively. The next table consists of three 32-byte bit maps which identify
-digits, "word" characters, and white space, respectively. These are used when
-building 32-byte bit maps that represent character classes for code points less
-than 256. The final 256-byte table has bits indicating various character types,
-as follows:
+The second argument names the file where the source code for the tables is
+written. The first two 256-byte tables provide lower casing and case flipping
+functions, respectively. The next table consists of a number of 32-byte bit
+maps which identify certain character classes such as digits, "word"
+characters, white space, etc. These are used when building 32-byte bit maps
+that represent character classes for code points less than 256. The final
+256-byte table has bits indicating various character types, as follows:

    1   white space character
    2   letter
-    4   decimal digit
-    8   hexadecimal digit
+    4   lower case letter
+    8   decimal digit
   16   alphanumeric or '_'
-  128   regular expression metacharacter or binary zero

-You should not alter the set of characters that contain the 128 bit, as that
-will cause PCRE2 to malfunction.
+You can also specify -b (with or without -L) when running pcre2_dftables. This
+causes the tables to be written in binary instead of as source code. A set of
+binary tables can be loaded into memory by an application and passed to
+pcre2_compile() in the same way as tables created dynamically by calling
+pcre2_maketables(). The tables are just a string of bytes, independent of
+hardware characteristics such as endianness. This means they can be bundled
+with an application that runs in different environments, to ensure consistent
+behaviour.
+
+See also the pcre2build section "Creating character tables at build time".


 File manifest
@ -766,7 +786,7 @@ The distribution should contain the files listed below.
 (A) Source files for the PCRE2 library functions and their headers are found in
    the src directory:

-  src/dftables.c           auxiliary program for building pcre2_chartables.c
+  src/pcre2_dftables.c     auxiliary program for building pcre2_chartables.c
                           when --enable-rebuild-chartables is specified

  src/pcre2_chartables.c.dist  a default set of character tables that assume
@ -890,6 +910,6 @@ The distribution should contain the files listed below.
                          )   environments

 Philip Hazel
-Email local part: ph10
-Email domain: cam.ac.uk
-Last updated: 16 April 2019
+Email local part: Philip.Hazel
+Email domain: gmail.com
+Last updated: 15 April 2022
--- a/README-OS4.md
+++ b/README-OS4.md
@ -0,0 +1,39 @@
+PCRE2 (Perl-compatible regular expression library)
+---------------------------------------------------------------------------
+
+This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
+GitHub repository https://github.com/PCRE2Project/pcre2
+
+More information about PCRE can be found at its official website
+at https://www.pcre.org and at the documentation that comes with this
+package.
+
+In the archive both newlib and clib2 libraries are included. It has been
+tested with various applications, but in case you find issues please 
+contact me.
+
+To install it into your AmigaOS 4 SDK installation, just extract all the 
+files in the SDK: path.
+
+Compile
+--------------------------
+The source and the changes I did can be found at my personale repository
+https://git.walkero.gr/walkero/pcre2
+
+You can compile it using the Makefile.os4 file, and produce the libraries
+yourself.
+
+* with newlib run:
+  ```bash
+  make -f Makefile.os4 all
+  ```
+* with clib2 run:
+  ```bash
+  make -f Makefile.os4 all USE_CLIB2=yes
+  ```
+
+Changelog
+--------------------------
+v10.40r1 - 2022-07-31
+* First release
+
--- a/README.md
+++ b/README.md
@ -0,0 +1,56 @@
+# PCRE2 - Perl-Compatible Regular Expressions
+
+The PCRE2 library is a set of C functions that implement regular expression
+pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
+own native API, as well as a set of wrapper functions that correspond to the
+POSIX regular expression API. The PCRE2 library is free, even for building 
+proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
+or 32-bit code units, in either literal or UTF encoding.
+
+PCRE2 was first released in 2015 to replace the API in the original PCRE 
+library, which is now obsolete and no longer maintained. As well as a more
+flexible API, the code of PCRE2 has been much improved since the fork.
+ 
+## Download
+
+As well as downloading from the 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
+or the older, unmaintained PCRE1 library from an 
+[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
+
+You can check out the PCRE2 source code via Git or Subversion:
+
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git
+
+## Contributed Ports
+
+If you just need the command-line PCRE2 tools on Windows, precompiled binary
+versions are available at this 
+[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
+
+A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
+default character encoding, can be found at 
+[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
+
+## Documentation
+
+You can read the PCRE2 documentation 
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
+
+Comparisons to Perl's regular expression semantics can be found in the
+community authored Wikipedia entry for PCRE.
+
+There is a curated summary of changes for each PCRE release, copies of
+documentation from older releases, and other useful information from the third
+party authored 
+[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
+
+## Contact
+
+To report a problem with the PCRE2 library, or to make a feature request, please
+use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
+ PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
+announcements will be made. You can browse the 
+[list archives](https://groups.google.com/g/pcre2-dev).
+
--- a/102
+++ b/102
@ -8,7 +8,7 @@
 # * Put printf arguments in single, not double quotes to avoid unwanted
 #     escaping.
 # * Use \0 for binary zero in printf, not \x0, for the benefit of older
-#     versions.
+#     versions (and use octal for other special values).

 # Set the C locale, so that sort(1) behaves predictably.

@ -68,6 +68,22 @@ diff -b  /dev/null /dev/null 2>/dev/null && cf="diff -b"
 diff -u  /dev/null /dev/null 2>/dev/null && cf="diff -u"
 diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"

+# Some tests involve NUL characters. It seems impossible to handle them easily
+# in many operating systems. An earlier version of this script used sed to
+# translate NUL into the string ZERO, but this didn't work on Solaris (aka
+# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
+# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
+# even when using GNU sed. A user suggested using tr instead, which
+# necessitates translating to a single character. However, on (some versions
+# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
+# /usr/xpg4/bin/tr is available, it can do so, so test for that.
+
+if [ -x /usr/xpg4/bin/tr ] ; then
+  tr=/usr/xpg4/bin/tr
+else
+  tr=tr
+fi
+
 # If this test is being run from "make check", $srcdir will be set. If not, set
 # it to the current or parent directory, whichever one contains the test data.
 # Subsequently, we run most of the pcre2grep tests in the source directory so
@ -558,7 +574,7 @@ echo "RC=$?" >>testtrygrep
 echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
 echo "a" >testtemp1grep
 echo "aaaaa" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep  --line-offsets '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep  --line-offsets --allow-lookaround-bsk '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 108 ------------------------------" >>testtrygrep
@ -638,13 +654,13 @@ echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
 printf 'abcd\n' >testNinputgrep
-$valgrind $vjs $pcre2grep --colour=always '(?<=\K.)' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K.)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
-$valgrind $vjs $pcre2grep --colour=always '(?=.\K)' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=.\K)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
-$valgrind $vjs $pcre2grep --colour=always '(?<=\K[ac])' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K[ac])' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
-$valgrind $vjs $pcre2grep --colour=always '(?=[ac]\K)' testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep

 echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
@ -653,6 +669,47 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
 $valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
 echo "RC=$?" >>testtrygrep

+echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
+(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
+(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep

 # Now compare the results.

@ -674,7 +731,21 @@ if [ $utf8 -ne 0 ] ; then
  echo "RC=$?" >>testtrygrep

  echo "---------------------------- Test U3 ------------------------------" >>testtrygrep
-  (cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
+  (cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any --allow-lookaround-bsk '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
+  echo "RC=$?" >>testtrygrep
+
+  echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
+  printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
+  (cd $srcdir; $valgrind $vjs $pcre2grep -u -o '....' $builddir/testtemp1grep) >>testtrygrep 2>&1
+  echo "RC=$?" >>testtrygrep
+
+  echo "---------------------------- Test U5 ------------------------------" >>testtrygrep
+  printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
+  (cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
+  echo "RC=$?" >>testtrygrep
+
+  echo "---------------------------- Test U6 -----------------------------" >>testtrygrep
+  (cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
  echo "RC=$?" >>testtrygrep

  $cf $srcdir/testdata/grepoutput8 testtrygrep
@ -714,24 +785,10 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
 printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
 $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep

-# It seems impossible to handle NUL characters easily in many operating
-# systems, including Solaris (aka SunOS), where the version of sed explicitly
-# doesn't like them, and also MacOS (Darwin), OpenBSD, FreeBSD, and NetBSD. So
-# now we run this test only on OS that are known to work. For the rest, we
-# fudge the output so that the comparison works.
-
 printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
-uname=`uname`
-case $uname in
-  Linux)
 printf 'abc\0def' >testNinputgrep
-    $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | sed 's/\x00/ZERO/' >>testtrygrep
+$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
 echo "" >>testtrygrep
-    ;;
-  *)
-    echo '1:abcZERO2:def' >>testtrygrep
-    ;;
-esac

 $cf $srcdir/testdata/grepoutputN testtrygrep
 if [ $? != 0 ] ; then exit 1; fi
@ -747,6 +804,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
  $valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
  $valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
  $valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
+  $valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep

  if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
    $cf $srcdir/testdata/grepoutputCN testtrygrep
--- a/72
+++ b/72
@ -17,8 +17,16 @@
 # individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
 # end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
 # runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
-# except test 10. Whatever order the arguments are in, the tests are always run
-# in numerical order.
+# except test 10. Whatever order the arguments are in, these tests are always
+# run in numerical order.
+#
+# If no specific tests are selected (which is the case when this script is run
+# via 'make check') the default is to run all the numbered tests.
+#
+# There may also be named (as well as numbered) tests for special purposes. At
+# present there is just one, called "heap". This test's output contains the
+# sizes of heap frames and frame vectors, which depend on the environment. It
+# is therefore not run unless explicitly requested.
 #
 # Inappropriate tests are automatically skipped (with a comment to say so). For
 # example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
 title23="Test 23: \C disabled test"
 title24="Test 24: Non-UTF pattern conversion tests"
 title25="Test 25: UTF pattern conversion tests"
-maxtest=25
+title26="Test 26: Auto-generated unicode property tests"
+maxtest=26
+titleheap="Test 'heap': Environment-specific heap tests"

 if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title0
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title23
  echo $title24
  echo $title25
+  echo $title26
+  echo ""
+  echo $titleheap
+  echo ""
+  echo "Numbered tests are automatically run if nothing selected."
+  echo "Named tests must be explicitly selected."
  exit 0
 fi

@ -238,6 +254,8 @@ do22=no
 do23=no
 do24=no
 do25=no
+do26=no
+doheap=no

 while [ $# -gt 0 ] ; do
  case $1 in
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
   23) do23=yes;;
   24) do24=yes;;
   25) do25=yes;;
+   26) do26=yes;;
+ heap) doheap=yes;;
   -8) arg8=yes;;
  -16) arg16=yes;;
  -32) arg32=yes;;
@ -320,7 +340,8 @@ fi
 # set up a large stack.

 $sim ./pcre2test -S 64 /dev/null /dev/null
-if [ $? -eq 0 -a "$bigstack" != "" ] ; then
+support_setstack=$?
+if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
  setstack="-S 64"
 else
  setstack=""
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
  fi
 fi

-# If no specific tests were requested, select all. Those that are not
-# relevant will be automatically skipped.
+# If no specific tests were requested, select all the numbered tests. Those
+# that are not relevant will be automatically skipped.

 if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do4  = no -a $do5  = no -a $do6  = no -a $do7  = no -a \
@ -416,7 +437,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
     $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
     $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
-     $do24 = no -a $do25 = no \
+     $do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
   ]; then
  do0=yes
  do1=yes
@ -444,6 +465,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
  do23=yes
  do24=yes
  do25=yes
+  do26=yes
 fi

 # Handle any explicit skips at this stage, so that an argument list may consist
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
    echo '' >testtry
    checkspecial '-C'
    checkspecial '--help'
+    if [ $support_setstack -eq 0 ] ; then
      checkspecial '-S 1 -t 10 testSinput'
+    fi
    echo "  OK"
  fi

@ -493,15 +517,20 @@ for bmode in "$test8" "$test16" "$test32"; do
    done
  fi

-  # PCRE2 tests that are not Perl-compatible: API, errors, internals
+  # PCRE2 tests that are not Perl-compatible: API, errors, internals. We copy
+  # the testbtables file to the current directory for use by this test.

  if [ $do2 = yes ] ; then
    echo $title2 "(excluding UTF-$bits)"
+    cp $testdata/testbtables .
    for opt in "" $jitopt; do
      $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
-      if [ $? = 0 ] ; then
+      saverc=$?
+      if [ $saverc = 0 ] ; then
        $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -70,-62,-2,-1,0,100,101,191,200 >>testtry
        checkresult $? 2 "$opt"
+      else
+        checkresult $saverc 2 "$opt"
      fi
    done
  fi
@ -855,10 +884,33 @@ for bmode in "$test8" "$test16" "$test32"; do
    fi
  fi

+  # Auto-generated unicode property tests
+
+  if [ $do26 = yes ] ; then
+    echo $title26
+    if [ $utf -eq 0 ] ; then
+      echo "  Skipped because UTF-$bits support is not available"
+    else
+      for opt in "" $jitopt; do
+        $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
+        checkresult $? 26 "$opt"
+      done
+    fi
+  fi
+
+  # Manually selected heap tests - output may vary in different environments,
+  # which is why that are not automatically run.
+
+  if [ $doheap = yes ] ; then
+    echo $titleheap
+    $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
+    checkresult $? heap-$bits ""
+  fi
+
 # End of loop for 8/16/32-bit tests
 done

 # Clean up local working files
-rm -f testSinput test3input testsaved1 testsaved2 test3output test3outputA test3outputB teststdout teststderr testtry
+rm -f testbtables testSinput test3input testsaved1 testsaved2 test3output test3outputA test3outputB teststdout teststderr testtry

 # End
--- a/RunTest.bat
+++ b/RunTest.bat
@ -26,6 +26,7 @@
@rem Updated for new test 14 (moving others up a number), August 2015.
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
@rem PH added missing "set type" for test 22, April 2016.
+@rem PH added copy command for new testbtables file, November 2020


 setlocal enabledelayedexpansion
@ -134,9 +135,9 @@ if "%all%" == "yes" (
  set do7=yes
  set do8=yes
  set do9=yes
-  set do10=yes
+  set do10=no
  set do11=yes
-  set do12=yes
+  set do12=no
  set do13=yes
  set do14=yes
  set do15=yes
@ -305,6 +306,7 @@ if %jit% EQU 1 call :runsub 1 testoutjit "Test with JIT Override" -q -jit
 goto :eof

 :do2
+  copy /y %srcdir%\testdata\testbtables testbtables
 
  call :runsub 2 testout "API, errors, internals, and non-Perl stuff" -q
  if %jit% EQU 1 call :runsub 2 testoutjit "Test with JIT Override" -q -jit
 goto :eof
--- a/WORKSPACE.bazel
+++ b/WORKSPACE.bazel
@ -0,0 +1 @@
+# See MODULE.bazel
--- a/cmake/FindEditline.cmake
+++ b/cmake/FindEditline.cmake
@ -1,12 +1,11 @@
 # Modified from FindReadline.cmake (PH Feb 2012)

-if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
+if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
  set(EDITLINE_FOUND TRUE)
-else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
-  FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
-    /usr/include/editline
-    /usr/include/edit/readline  
-    /usr/include/readline
+else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
+  FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
+    editline
+    edit/readline
  )
  
  FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
@ -14,4 +13,4 @@ else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
  FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)

  MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
-endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
+endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
--- a/cmake/pcre2-config-version.cmake.in
+++ b/cmake/pcre2-config-version.cmake.in
@ -0,0 +1,15 @@
+set(PACKAGE_VERSION_MAJOR @PCRE2_MAJOR@)
+set(PACKAGE_VERSION_MINOR @PCRE2_MINOR@)
+set(PACKAGE_VERSION_PATCH 0)
+set(PACKAGE_VERSION @PCRE2_MAJOR@.@PCRE2_MINOR@.0)
+
+# Check whether the requested PACKAGE_FIND_VERSION is compatible
+if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION OR
+   PACKAGE_VERSION_MAJOR GREATER PACKAGE_FIND_VERSION_MAJOR)
+  set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  if(PACKAGE_VERSION VERSION_EQUAL PACKAGE_FIND_VERSION)
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
--- a/cmake/pcre2-config.cmake.in
+++ b/cmake/pcre2-config.cmake.in
@ -0,0 +1,145 @@
+# pcre2-config.cmake
+# ----------------
+#
+# Finds the PCRE2 library, specify the starting search path in PCRE2_ROOT.
+#
+# Static vs. shared
+# -----------------
+# To make use of the static library instead of the shared one, one needs
+# to set the variable PCRE2_USE_STATIC_LIBS to ON before calling find_package.
+# Example:
+#   set(PCRE2_USE_STATIC_LIBS ON)
+#   find_package(PCRE2 CONFIG COMPONENTS 8BIT)
+#
+# This will define the following variables:
+#
+#   PCRE2_FOUND   - True if the system has the PCRE2 library.
+#   PCRE2_VERSION - The version of the PCRE2 library which was found.
+#
+# and the following imported targets:
+#
+#   PCRE2::8BIT  - The 8 bit PCRE2 library.
+#   PCRE2::16BIT - The 16 bit PCRE2 library.
+#   PCRE2::32BIT - The 32 bit PCRE2 library.
+#   PCRE2::POSIX - The POSIX PCRE2 library.
+
+set(PCRE2_NON_STANDARD_LIB_PREFIX @NON_STANDARD_LIB_PREFIX@)
+set(PCRE2_NON_STANDARD_LIB_SUFFIX @NON_STANDARD_LIB_SUFFIX@)
+set(PCRE2_8BIT_NAME pcre2-8)
+set(PCRE2_16BIT_NAME pcre2-16)
+set(PCRE2_32BIT_NAME pcre2-32)
+set(PCRE2_POSIX_NAME pcre2-posix)
+find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h DOC "PCRE2 include directory")
+if (PCRE2_USE_STATIC_LIBS)
+  if (MSVC)
+    set(PCRE2_8BIT_NAME pcre2-8-static)
+    set(PCRE2_16BIT_NAME pcre2-16-static)
+    set(PCRE2_32BIT_NAME pcre2-32-static)
+    set(PCRE2_POSIX_NAME pcre2-posix-static)
+  endif ()
+
+  set(PCRE2_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
+  set(PCRE2_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+else ()
+  set(PCRE2_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
+  if (MINGW AND PCRE2_NON_STANDARD_LIB_PREFIX)
+    set(PCRE2_PREFIX "")
+  endif ()
+
+  set(PCRE2_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if (MINGW AND PCRE2_NON_STANDARD_LIB_SUFFIX)
+    set(PCRE2_SUFFIX "-0.dll")
+  endif ()
+endif ()
+find_library(PCRE2_8BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit PCRE2 library")
+find_library(PCRE2_16BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "16 bit PCRE2 library")
+find_library(PCRE2_32BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "32 bit PCRE2 library")
+find_library(PCRE2_POSIX_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit POSIX PCRE2 library")
+unset(PCRE2_NON_STANDARD_LIB_PREFIX)
+unset(PCRE2_NON_STANDARD_LIB_SUFFIX)
+unset(PCRE2_8BIT_NAME)
+unset(PCRE2_16BIT_NAME)
+unset(PCRE2_32BIT_NAME)
+unset(PCRE2_POSIX_NAME)
+
+# Set version
+if (PCRE2_INCLUDE_DIR)
+  set(PCRE2_VERSION "@PCRE2_MAJOR@.@PCRE2_MINOR@.0")
+endif ()
+
+# Which components have been found.
+if (PCRE2_8BIT_LIBRARY)
+  set(PCRE2_8BIT_FOUND TRUE)
+endif ()
+if (PCRE2_16BIT_LIBRARY)
+  set(PCRE2_16BIT_FOUND TRUE)
+endif ()
+if (PCRE2_32BIT_LIBRARY)
+  set(PCRE2_32BIT_FOUND TRUE)
+endif ()
+if (PCRE2_POSIX_LIBRARY)
+  set(PCRE2_POSIX_FOUND TRUE)
+endif ()
+
+# Check if at least one component has been specified.
+list(LENGTH PCRE2_FIND_COMPONENTS PCRE2_NCOMPONENTS)
+if (PCRE2_NCOMPONENTS LESS 1)
+  message(FATAL_ERROR "No components have been specified. This is not allowed. Please, specify at least one component.")
+endif ()
+unset(PCRE2_NCOMPONENTS)
+
+# When POSIX component has been specified make sure that also 8BIT component is specified.
+set(PCRE2_8BIT_COMPONENT FALSE)
+set(PCRE2_POSIX_COMPONENT FALSE)
+foreach(component ${PCRE2_FIND_COMPONENTS})
+  if (component STREQUAL "8BIT")
+    set(PCRE2_8BIT_COMPONENT TRUE)
+  elseif (component STREQUAL "POSIX")
+    set(PCRE2_POSIX_COMPONENT TRUE)
+  endif ()
+endforeach()
+
+if (PCRE2_POSIX_COMPONENT AND NOT PCRE2_8BIT_COMPONENT)
+  message(FATAL_ERROR "The component POSIX is specified while the 8BIT one is not. This is not allowed. Please, also specify the 8BIT component.")
+endif()
+unset(PCRE2_8BIT_COMPONENT)
+unset(PCRE2_POSIX_COMPONENT)
+
+include(FindPackageHandleStandardArgs)
+set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
+find_package_handle_standard_args(PCRE2
+  FOUND_VAR PCRE2_FOUND
+  REQUIRED_VARS PCRE2_INCLUDE_DIR
+  HANDLE_COMPONENTS
+  VERSION_VAR PCRE2_VERSION
+  CONFIG_MODE
+)
+
+set(PCRE2_LIBRARIES)
+if (PCRE2_FOUND)
+  foreach(component ${PCRE2_FIND_COMPONENTS})
+    if (PCRE2_USE_STATIC_LIBS)
+      add_library(PCRE2::${component} STATIC IMPORTED)
+      target_compile_definitions(PCRE2::${component} INTERFACE PCRE2_STATIC)
+    else ()
+      add_library(PCRE2::${component} SHARED IMPORTED)
+    endif ()
+    set_target_properties(PCRE2::${component} PROPERTIES
+      IMPORTED_LOCATION "${PCRE2_${component}_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_INCLUDE_DIR}"
+    )
+    if (component STREQUAL "POSIX")
+      set_target_properties(PCRE2::${component} PROPERTIES
+        INTERFACE_LINK_LIBRARIES "PCRE2::8BIT"
+        LINK_LIBRARIES "PCRE2::8BIT"
+      )
+    endif ()
+
+    set(PCRE2_LIBRARIES ${PCRE2_LIBRARIES} ${PCRE2_${component}_LIBRARY})
+    mark_as_advanced(PCRE2_${component}_LIBRARY)
+  endforeach()
+endif ()
+
+mark_as_advanced(
+  PCRE2_INCLUDE_DIR
+)
--- a/config-cmake.h.in
+++ b/config-cmake.h.in
@ -1,8 +1,7 @@
 /* config.h for CMake builds */

+#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
 #cmakedefine HAVE_DIRENT_H 1
-#cmakedefine HAVE_INTTYPES_H 1    
-#cmakedefine HAVE_STDINT_H 1                                                   
 #cmakedefine HAVE_STRERROR 1
 #cmakedefine HAVE_SYS_STAT_H 1
 #cmakedefine HAVE_SYS_TYPES_H 1
@ -10,9 +9,10 @@
 #cmakedefine HAVE_WINDOWS_H 1

 #cmakedefine HAVE_BCOPY 1
+#cmakedefine HAVE_MEMFD_CREATE 1
 #cmakedefine HAVE_MEMMOVE 1
-
-#cmakedefine PCRE2_STATIC 1
+#cmakedefine HAVE_SECURE_GETENV 1
+#cmakedefine HAVE_STRERROR 1

 #cmakedefine SUPPORT_PCRE2_8 1
 #cmakedefine SUPPORT_PCRE2_16 1
--- a/configure.ac
+++ b/configure.ac
@ -9,21 +9,21 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
 dnl be defined as -RC2, for example. For real releases, it should be empty.

 m4_define(pcre2_major, [10])
-m4_define(pcre2_minor, [33])
+m4_define(pcre2_minor, [41])
 m4_define(pcre2_prerelease, [])
-m4_define(pcre2_date, [2019-04-16])
+m4_define(pcre2_date, [2022-xx-xx])
+
+# Libtool shared library interface versions (current:revision:age)
+m4_define(libpcre2_8_version,     [11:0:11])
+m4_define(libpcre2_16_version,    [11:0:11])
+m4_define(libpcre2_32_version,    [11:0:11])
+m4_define(libpcre2_posix_version, [3:2:0])

 # NOTE: The CMakeLists.txt file searches for the above variables in the first
 # 50 lines of this file. Please update that if the variables above are moved.

-# Libtool shared library interface versions (current:revision:age)
-m4_define(libpcre2_8_version,     [8:0:8])
-m4_define(libpcre2_16_version,    [8:0:8])
-m4_define(libpcre2_32_version,    [8:0:8])
-m4_define(libpcre2_posix_version, [2:2:0])
-
-AC_PREREQ(2.57)
-AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2)
+AC_PREREQ([2.60])
+AC_INIT([PCRE2],pcre2_major.pcre2_minor[]pcre2_prerelease,[],[pcre2])
 AC_CONFIG_SRCDIR([src/pcre2.h.in])
 AM_INIT_AUTOMAKE([dist-bzip2 dist-zip])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
@ -64,14 +64,31 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 AC_TYPE_INT64_T

 AC_PROG_INSTALL
-AC_LIBTOOL_WIN32_DLL
-LT_INIT
+LT_INIT([win32-dll])
 AC_PROG_LN_S

 # Check for GCC visibility feature

 PCRE2_VISIBILITY

+# Check for Clang __attribute__((uninitialized)) feature
+
+AC_MSG_CHECKING([for __attribute__((uninitialized))])
+AC_LANG_PUSH([C])
+tmp_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS -Werror"
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
+                   [[char buf[128] __attribute__((uninitialized));(void)buf]])],
+                   [pcre2_cc_cv_attribute_uninitialized=yes],
+                   [pcre2_cc_cv_attribute_uninitialized=no])
+AC_MSG_RESULT([$pcre2_cc_cv_attribute_uninitialized])
+if test "$pcre2_cc_cv_attribute_uninitialized" = yes; then
+  AC_DEFINE([HAVE_ATTRIBUTE_UNINITIALIZED], 1, [Define this if your compiler
+             supports __attribute__((uninitialized))])
+fi
+CFLAGS=$tmp_CFLAGS
+AC_LANG_POP([C])
+
 # Versioning

 PCRE2_MAJOR="pcre2_major"
@ -158,11 +175,18 @@ if test "$enable_jit" = "auto"; then
  echo checking for JIT support on this hardware... $enable_jit
 fi

-# Handle --enable-jit-sealloc (disabled by default)
+# Handle --enable-jit-sealloc (disabled by default and only experimental)
+case $host_os in
+  linux* | netbsd*)
    AC_ARG_ENABLE(jit-sealloc,
      AS_HELP_STRING([--enable-jit-sealloc],
-                             [enable SELinux compatible execmem allocator in JIT]),
+        [enable SELinux compatible execmem allocator in JIT (experimental)]),
        ,enable_jit_sealloc=no)
+    ;;
+  *)
+    enable_jit_sealloc=unsupported
+    ;;
+esac

 # Handle --disable-pcre2grep-jit (enabled by default)
 AC_ARG_ENABLE(pcre2grep-jit,
@ -399,7 +423,7 @@ case "$enable_newline" in
  anycrlf) ac_pcre2_newline_value=5 ;;
  nul)     ac_pcre2_newline_value=6 ;;
  *)
-  AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
+  AC_MSG_ERROR([invalid argument "$enable_newline" to --enable-newline option])
  ;;
 esac

@ -428,7 +452,7 @@ fi
 case "$with_link_size" in
  2|3|4) ;;
  *)
-  AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option])
+  AC_MSG_ERROR([invalid argument "$with_link_size" to --with-link-size option])
  ;;
 esac

@ -461,7 +485,6 @@ HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
 sure both macros are undefined; an emulation function will then be used. */])

 # Checks for header files.
-AC_HEADER_STDC
 AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h)
 AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1])
 AC_CHECK_HEADERS([sys/wait.h], [HAVE_SYS_WAIT_H=1])
@ -489,7 +512,20 @@ AC_TYPE_SIZE_T

 # Checks for library functions.

-AC_CHECK_FUNCS(bcopy memmove strerror mkostemp secure_getenv)
+AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
+AC_MSG_CHECKING([for realpath])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+#include <stdlib.h>
+#include <limits.h>
+]],[[
+char buffer[PATH_MAX];
+realpath(".", buffer);
+]])],
+[AC_MSG_RESULT([yes])
+ AC_DEFINE([HAVE_REALPATH], 1,
+  [Define to 1 if you have the `realpath' function.])
+],
+AC_MSG_RESULT([no]))

 # Check for the availability of libz (aka zlib)

@ -561,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
 fi
 fi

-
 # Check for the availability of libedit. Different distributions put its
 # headers in different places. Try to cover the most common ones.

 if test "$enable_pcre2test_libedit" = "yes"; then
-  AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
-    [AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
-      [AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
+  AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
+    HAVE_LIBEDIT_HEADER=1
+    break
+  ])
  AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
 fi

@ -904,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
    echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
    exit 1
  fi
-  if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
-          "$HAVE_READLINE_READLINE_H" != "1"; then
-    echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
-    echo "** nor readline/readline.h was found."
+  if test -z "$HAVE_LIBEDIT_HEADER"; then
+    echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
+    echo "** edit/readline/readline.h nor a compatible header was found."
    exit 1
  fi
  if test -z "$LIBEDIT"; then
@ -981,7 +1016,27 @@ fi # enable_coverage

 AM_CONDITIONAL([WITH_GCOV],[test "x$enable_coverage" = "xyes"])

+AC_MSG_CHECKING([whether Intel CET is enabled])
+AC_LANG_PUSH([C])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
+                   [[#ifndef __CET__
+# error CET is not enabled
+#endif]])],
+                   [pcre2_cc_cv_intel_cet_enabled=yes],
+                   [pcre2_cc_cv_intel_cet_enabled=no])
+AC_MSG_RESULT([$pcre2_cc_cv_intel_cet_enabled])
+if test "$pcre2_cc_cv_intel_cet_enabled" = yes; then
+  CET_CFLAGS="-mshstk"
+  AC_SUBST([CET_CFLAGS])
+fi
+AC_LANG_POP([C])
+
+# LIB_POSTFIX is used by CMakeLists.txt for Windows debug builds.
+# Pass empty LIB_POSTFIX to *.pc files and pcre2-config here.
+AC_SUBST(LIB_POSTFIX)
+
 # Produce these files, in addition to config.h.
+
 AC_CONFIG_FILES(
 	Makefile
 	libpcre2-8.pc
--- a/doc/html/NON-AUTOTOOLS-BUILD.txt
+++ b/doc/html/NON-AUTOTOOLS-BUILD.txt
@ -40,7 +40,11 @@ GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY

 The following are generic instructions for building the PCRE2 C library "by
 hand". If you are going to use CMake, this section does not apply to you; you
-can skip ahead to the CMake section.
+can skip ahead to the CMake section. Note that the settings concerned with
+8-bit, 16-bit, and 32-bit code units relate to the type of data string that
+PCRE2 processes. They are NOT referring to the underlying operating system bit
+width. You do not have to do anything special to compile in a 64-bit
+environment, for example.

 (1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
     macro settings that it contains to whatever is appropriate for your
@ -74,23 +78,23 @@ can skip ahead to the CMake section.
       src/pcre2_chartables.c.

     OR:
-       Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
-       if you have set up src/config.h), and then run it with the single
-       argument "src/pcre2_chartables.c". This generates a set of standard
-       character tables and writes them to that file. The tables are generated
-       using the default C locale for your system. If you want to use a locale
-       that is specified by LC_xxx environment variables, add the -L option to
-       the dftables command. You must use this method if you are building on a
-       system that uses EBCDIC code.
+       Compile src/pcre2_dftables.c as a stand-alone program (using
+       -DHAVE_CONFIG_H if you have set up src/config.h), and then run it with
+       the single argument "src/pcre2_chartables.c". This generates a set of
+       standard character tables and writes them to that file. The tables are
+       generated using the default C locale for your system. If you want to use
+       a locale that is specified by LC_xxx environment variables, add the -L
+       option to the pcre2_dftables command. You must use this method if you
+       are building on a system that uses EBCDIC code.

     The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
     specify alternative tables at run time.

- (4) For an 8-bit library, compile the following source files from the src
-     directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also
-     set -DHAVE_CONFIG_H if you have set up src/config.h with your
-     configuration, or else use other -D settings to change the configuration
-     as required.
+ (4) For a library that supports 8-bit code units in the character strings that
+     it processes, compile the following source files from the src directory,
+     setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set
+     -DHAVE_CONFIG_H if you have set up src/config.h with your configuration,
+     or else use other -D settings to change the configuration as required.

       pcre2_auto_possess.c
       pcre2_chartables.c
@ -117,6 +121,7 @@ can skip ahead to the CMake section.
       pcre2_substring.c
       pcre2_tables.c
       pcre2_ucd.c
+       pcre2_ucptables.c
       pcre2_valid_utf.c
       pcre2_xclass.c

@ -142,9 +147,9 @@ can skip ahead to the CMake section.
     If your system has static and shared libraries, you may have to do this
     once for each type.

- (6) If you want to build a 16-bit library or 32-bit library (as well as, or
-     instead of the 8-bit library) just supply 16 or 32 as the value of
-     -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
+ (6) If you want to build a library that supports 16-bit or 32-bit code units,
+     (as well as, or instead of the 8-bit library) just supply 16 or 32 as the
+     value of -DPCRE2_CODE_UNIT_WIDTH when you are compiling.

 (7) If you want to build the POSIX wrapper functions (which apply only to the
     8-bit library), ensure that you have the src/pcre2posix.h file and then
@ -302,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
 3.  Create a new, empty build directory, preferably a subdirectory of the
    source dir. For example, C:\pcre2\pcre2-xx\build.

-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
    Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
    to start Cmake from the Windows Start menu, as this can lead to errors.

@ -339,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".

 BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO

-The code currently cannot be compiled without a stdint.h header, which is
-available only in relatively recent versions of Visual Studio. However, this
-portable and permissively-licensed implementation of the header worked without
-issue:
+The code currently cannot be compiled without an inttypes.h header, which is
+available only with Visual Studio 2013 or newer. However, this portable and
+permissively-licensed implementation of the stdint.h header could be used as an
+alternative:

  http://www.azillionmonkeys.com/qed/pstdint.h

@ -369,7 +374,7 @@ Otherwise:
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
   have been created.

-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
   the pcre2 source (wherein which the testdata folder resides), e.g.:

   set srcdir=C:\pcre2\pcre2-10.00
@ -401,6 +406,6 @@ Everything in that location, source and executable, is in EBCDIC and native
 z/OS file formats. The port provides an API for LE languages such as COBOL and
 for the z/OS and z/VM versions of the Rexx languages.

-==============================
-Last Updated: 14 November 2018
-==============================
+===========================
+Last Updated: 28 April 2021
+===========================
--- a/doc/html/README.txt
+++ b/doc/html/README.txt
@ -4,18 +4,20 @@ README file for PCRE2 (Perl-compatible regular expression library)
 PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
 API. Since its initial release in 2015, there has been further development of
 the code and it now differs from PCRE1 in more than just the API. There are new
-features and the internals have been improved. The latest release of PCRE2 is
-always available in three alternative formats from:
+features, and the internals have been improved. The original PCRE1 library is
+now obsolete and no longer maintained. The latest release of PCRE2 is available
+in .tar.gz, tar.bz2, or .zip form from this GitHub repository:

-  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
-  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
-  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
+https://github.com/PCRE2Project/pcre2/releases

-There is a mailing list for discussion about the development of PCRE (both the
-original and new APIs) at pcre-dev@exim.org. You can access the archives and
-subscribe or manage your subscription here:
+There is a mailing list for discussion about the development of PCRE2 at
+pcre2-dev@googlegroups.com. You can subscribe by sending an email to
+pcre2-dev+subscribe@googlegroups.com.

-   https://lists.exim.org/mailman/listinfo/pcre-dev
+You can access the archives and also subscribe or manage your subscription
+here:
+
+https://groups.google.com/g/pcre2-dev

 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@ -112,12 +114,18 @@ Building PCRE2 using autotools
 The following instructions assume the use of the widely used "configure; make;
 make install" (autotools) process.

-To build PCRE2 on system that supports autotools, first run the "configure"
-command from the PCRE2 distribution directory, with your current directory set
+If you have downloaded and unpacked a PCRE2 release tarball, run the
+"configure" command from the PCRE2 directory, with your current directory set
 to the directory where you want the files to be created. This command is a
 standard GNU "autoconf" configuration script, for which generic instructions
 are supplied in the file INSTALL.

+The files in the GitHub repository do not contain "configure". If you have
+downloaded the PCRE2 source files from GitHub, before you can run "configure"
+you must run the shell script called autogen.sh. This runs a number of
+autotools to create a "configure" script (you must of course have the autotools
+commands installed in order to do this).
+
 Most commonly, people build PCRE2 within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient. However,
 the usual methods of changing standard defaults are available. For example:
@ -164,9 +172,11 @@ library. They are also documented in the pcre2build man page.
  will be a compile time error. If in doubt, use --enable-jit=auto, which
  enables JIT only if the current hardware is supported.

-. If you are enabling JIT under SELinux you may also want to add
-  --enable-jit-sealloc, which enables the use of an execmem allocator in JIT
-  that is compatible with SELinux. This has no effect if JIT is not enabled.
+. If you are enabling JIT under SELinux environment you may also want to add
+  --enable-jit-sealloc, which enables the use of an executable memory allocator
+  that is compatible with SELinux. Warning: this allocator is experimental!
+  It does not support fork() operation and may crash when no disk space is
+  available. This option has no effect if JIT is disabled.

 . If you do not want to make use of the default support for UTF-8 Unicode
  character strings in the 8-bit library, UTF-16 Unicode character strings in
@ -184,10 +194,10 @@ library. They are also documented in the pcre2build man page.

  As well as supporting UTF strings, Unicode support includes support for the
  \P, \p, and \X sequences that recognize Unicode character properties.
-  However, only the basic two-letter properties such as Lu are supported.
-  Escape sequences such as \d and \w in patterns do not by default make use of
-  Unicode properties, but can be made to do so by setting the PCRE2_UCP option
-  or starting a pattern with (*UCP).
+  However, only a subset of Unicode properties are supported; see the
+  pcre2pattern man page for details. Escape sequences such as \d and \w in
+  patterns do not by default make use of Unicode properties, but can be made to
+  do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).

 . You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
  of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
@ -267,9 +277,9 @@ library. They are also documented in the pcre2build man page.

  --enable-rebuild-chartables

-  a program called dftables is compiled and run in the default C locale when
-  you obey "make". It builds a source file called pcre2_chartables.c. If you do
-  not specify this option, pcre2_chartables.c is created as a copy of
+  a program called pcre2_dftables is compiled and run in the default C locale
+  when you obey "make". It builds a source file called pcre2_chartables.c. If
+  you do not specify this option, pcre2_chartables.c is created as a copy of
  pcre2_chartables.c.dist. See "Character tables" below for further
  information.

@ -295,8 +305,8 @@ library. They are also documented in the pcre2build man page.
  unaddressable. This allows it to detect invalid memory accesses, and is
  mostly useful for debugging PCRE2 itself.

-. In environments where the gcc compiler is used and lcov version 1.6 or above
-  is installed, if you specify
+. In environments where the gcc compiler is used and lcov is installed, if you
+  specify

  --enable-coverage

@ -365,19 +375,20 @@ library. They are also documented in the pcre2build man page.
  necessary to specify something like LIBS="-lncurses" as well. This is
  because, to quote the readline INSTALL, "Readline uses the termcap functions,
  but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
+  applications which link with readline the option to choose an appropriate
+  library."
  If you get error messages about missing functions tgetstr, tgetent, tputs,
  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
  should fix it.

 . The C99 standard defines formatting modifiers z and t for size_t and
  ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-  environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-  defined and has a value greater than or equal to 199901L (indicating C99).
-  However, there is at least one environment that claims to be C99 but does not
-  support these modifiers. If --disable-percent-zt is specified, no use is made
-  of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
-  size_t values.
+  environments other than Microsoft Visual Studio versions earlier than 2013
+  when __STDC_VERSION__ is defined and has a value greater than or equal to
+  199901L (indicating C99). However, there is at least one environment that
+  claims to be C99 but does not support these modifiers. If
+  --disable-percent-zt is specified, no use is made of the z or t modifiers.
+  Instead of %td or %zu, %lu is used, with a cast for size_t values.

 . There is a special option called --enable-fuzz-support for use by people who
  want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
@ -390,10 +401,10 @@ library. They are also documented in the pcre2build man page.
  Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
  be created. This is normally run under valgrind or used when PCRE2 is
  compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.

 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
  which caused pcre2_match() to use individual blocks on the heap for
@ -546,11 +557,11 @@ Cross-compiling using autotools

 You can specify CC and CFLAGS in the normal way to the "configure" command, in
 order to cross-compile PCRE2 for some other host. However, you should NOT
-specify --enable-rebuild-chartables, because if you do, the dftables.c source
-file is compiled and run on the local host, in order to generate the inbuilt
-character tables (the pcre2_chartables.c file). This will probably not work,
-because dftables.c needs to be compiled with the local compiler, not the cross
-compiler.
+specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c
+source file is compiled and run on the local host, in order to generate the
+inbuilt character tables (the pcre2_chartables.c file). This will probably not
+work, because pcre2_dftables.c needs to be compiled with the local compiler,
+not the cross compiler.

 When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
 created by making a copy of pcre2_chartables.c.dist, which is a default set of
@ -558,9 +569,10 @@ tables that assumes ASCII code. Cross-compiling with the default tables should
 not be a problem.

 If you need to modify the character tables when cross-compiling, you should
-move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
-and run it on the local host to make a new version of pcre2_chartables.c.dist.
-Then when you cross-compile PCRE2 this new version of the tables will be used.
+move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by
+hand and run it on the local host to make a new version of
+pcre2_chartables.c.dist. See the pcre2build section "Creating character tables
+at build time" for more details.


 Making new tarballs
@ -597,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.

 Many (but not all) of the tests that are not skipped are run twice if JIT
 support is available. On the second run, JIT compilation is forced. This
-testing can be suppressed by putting "nojit" on the RunTest command line.
+testing can be suppressed by putting "-nojit" on the RunTest command line.

 The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
 libraries that are enabled. If you want to run just one set of tests, call
 RunTest with either the -8, -16 or -32 option.

-If valgrind is installed, you can run the tests under it by putting "valgrind"
+If valgrind is installed, you can run the tests under it by putting "-valgrind"
 on the RunTest command line. To run pcre2test on just one or more specific test
 files, give their numbers as arguments to RunTest, for example:

@ -684,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
 different code unit widths.

 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.

 Test 16 is run only when JIT support is not available. It checks that an
@ -719,8 +731,8 @@ compile context.
 The source file called pcre2_chartables.c contains the default set of tables.
 By default, this is created as a copy of pcre2_chartables.c.dist, which
 contains tables for ASCII coding. However, if --enable-rebuild-chartables is
-specified for ./configure, a different version of pcre2_chartables.c is built
-by the program dftables (compiled from dftables.c), which uses the ANSI C
+specified for ./configure, a new version of pcre2_chartables.c is built by the
+program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C
 character handling functions such as isalnum(), isalpha(), isupper(),
 islower(), etc. to build the table sources. This means that the default C
 locale that is set for your system will control the contents of these default
@ -730,32 +742,40 @@ file does not get automatically re-generated. The best way to do this is to
 move pcre2_chartables.c.dist out of the way and replace it with your customized
 tables.

-When the dftables program is run as a result of --enable-rebuild-chartables,
-it uses the default C locale that is set on your system. It does not pay
-attention to the LC_xxx environment variables. In other words, it uses the
-system's default locale rather than whatever the compiling user happens to have
-set. If you really do want to build a source set of character tables in a
-locale that is specified by the LC_xxx variables, you can run the dftables
-program by hand with the -L option. For example:
+When the pcre2_dftables program is run as a result of specifying
+--enable-rebuild-chartables, it uses the default C locale that is set on your
+system. It does not pay attention to the LC_xxx environment variables. In other
+words, it uses the system's default locale rather than whatever the compiling
+user happens to have set. If you really do want to build a source set of
+character tables in a locale that is specified by the LC_xxx variables, you can
+run the pcre2_dftables program by hand with the -L option. For example:

-  ./dftables -L pcre2_chartables.c.special
+  ./pcre2_dftables -L pcre2_chartables.c.special

-The first two 256-byte tables provide lower casing and case flipping functions,
-respectively. The next table consists of three 32-byte bit maps which identify
-digits, "word" characters, and white space, respectively. These are used when
-building 32-byte bit maps that represent character classes for code points less
-than 256. The final 256-byte table has bits indicating various character types,
-as follows:
+The second argument names the file where the source code for the tables is
+written. The first two 256-byte tables provide lower casing and case flipping
+functions, respectively. The next table consists of a number of 32-byte bit
+maps which identify certain character classes such as digits, "word"
+characters, white space, etc. These are used when building 32-byte bit maps
+that represent character classes for code points less than 256. The final
+256-byte table has bits indicating various character types, as follows:

    1   white space character
    2   letter
-    4   decimal digit
-    8   hexadecimal digit
+    4   lower case letter
+    8   decimal digit
   16   alphanumeric or '_'
-  128   regular expression metacharacter or binary zero

-You should not alter the set of characters that contain the 128 bit, as that
-will cause PCRE2 to malfunction.
+You can also specify -b (with or without -L) when running pcre2_dftables. This
+causes the tables to be written in binary instead of as source code. A set of
+binary tables can be loaded into memory by an application and passed to
+pcre2_compile() in the same way as tables created dynamically by calling
+pcre2_maketables(). The tables are just a string of bytes, independent of
+hardware characteristics such as endianness. This means they can be bundled
+with an application that runs in different environments, to ensure consistent
+behaviour.
+
+See also the pcre2build section "Creating character tables at build time".


 File manifest
@ -766,7 +786,7 @@ The distribution should contain the files listed below.
 (A) Source files for the PCRE2 library functions and their headers are found in
    the src directory:

-  src/dftables.c           auxiliary program for building pcre2_chartables.c
+  src/pcre2_dftables.c     auxiliary program for building pcre2_chartables.c
                           when --enable-rebuild-chartables is specified

  src/pcre2_chartables.c.dist  a default set of character tables that assume
@ -890,6 +910,6 @@ The distribution should contain the files listed below.
                          )   environments

 Philip Hazel
-Email local part: ph10
-Email domain: cam.ac.uk
-Last updated: 16 April 2019
+Email local part: Philip.Hazel
+Email domain: gmail.com
+Last updated: 15 April 2022
--- a/doc/html/index.html
+++ b/doc/html/index.html
@ -146,6 +146,9 @@ in the library.
 <tr><td><a href="pcre2_get_mark.html">pcre2_get_mark</a></td>
    <td>&nbsp;&nbsp;Get a (*MARK) name</td></tr>

+<tr><td><a href="pcre2_get_match_data_size.html">pcre2_get_match_data_size</a></td>
+    <td>&nbsp;&nbsp;Get the size of a match data block</td></tr>
+
 <tr><td><a href="pcre2_get_ovector_count.html">pcre2_get_ovector_count</a></td>
    <td>&nbsp;&nbsp;Get the ovector count</td></tr>

@ -176,6 +179,9 @@ in the library.
 <tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
    <td>&nbsp;&nbsp;Build character tables in current locale</td></tr>

+<tr><td><a href="pcre2_maketables_free.html">pcre2_maketables_free</a></td>
+    <td>&nbsp;&nbsp;Free character tables</td></tr>
+
 <tr><td><a href="pcre2_match.html">pcre2_match</a></td>
    <td>&nbsp;&nbsp;Match a compiled pattern to a subject string
    (Perl compatible)</td></tr>
--- a/doc/html/pcre2.html
+++ b/doc/html/pcre2.html
@ -28,7 +28,8 @@ nearly two decades, the limitations of the original API were making development
 increasingly difficult. The new API is more extensible, and it was simplified
 by abolishing the separate "study" optimizing function; in PCRE2, patterns are
 automatically optimized where possible. Since forking from PCRE1, the code has
-been extensively refactored and new features introduced.
+been extensively refactored and new features introduced. The old library is now
+obsolete and is no longer maintained.
 </P>
 <P>
 As well as Perl-style regular expression patterns, some features that appeared
@ -38,8 +39,14 @@ Oniguruma syntax items, and there are options for requesting some minor changes
 that give better ECMAScript (aka JavaScript) compatibility.
 </P>
 <P>
-The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
-code units, which means that up to three separate libraries may be installed.
+The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit,
+or 32-bit code units, which means that up to three separate libraries may be
+installed, one for each code unit size. The size of code unit is not related to
+the bit size of the underlying hardware. In a 64-bit environment that also
+supports 32-bit applications, versions of PCRE2 that are compiled in both
+64-bit and 32-bit modes may be needed.
+</P>
+<P>
 The original work to extend PCRE to 16-bit and 32-bit code units was done by
 Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
 can be interpreted either as one character per code unit, or as UTF-encoded
@ -187,20 +194,20 @@ function, listing its arguments and results.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <P>
 Putting an actual email address here is a spam magnet. If you want to email me,
-use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
+use my two names separated by a dot at gmail.com.
 </P>
 <br><a name="SEC5" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 17 September 2018
+Last updated: 27 August 2021
 <br>
-Copyright &copy; 1997-2018 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2_compile.html
+++ b/doc/html/pcre2_compile.html
@ -65,6 +65,7 @@ The option bits are:
  PCRE2_EXTENDED           Ignore white space and # comments
  PCRE2_FIRSTLINE          Force matching to be before newline
  PCRE2_LITERAL            Pattern characters are all literal
+  PCRE2_MATCH_INVALID_UTF  Enable support for matching invalid UTF
  PCRE2_MATCH_UNSET_BACKREF  Match unset backreferences
  PCRE2_MULTILINE          ^ and $ match newlines within data
  PCRE2_NEVER_BACKSLASH_C  Lock out the use of \C in patterns
@ -91,8 +92,18 @@ Additional options may be set in the compile context via the
 function.
 </P>
 <P>
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the <i>errorcode</i> argument to the the
+<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
+error was encountered is returned via the <i>erroroffset</i> argument.
+</P>
+<P>
+If there is no error, the value passed via <i>errorcode</i> returns the message
+"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
+via <i>erroroffset</i> is zero.
 </P>
 <P>
 There is a complete description of the PCRE2 native API, with more detail on
--- a/doc/html/pcre2_dfa_match.html
+++ b/doc/html/pcre2_dfa_match.html
@ -45,10 +45,16 @@ just once (except when processing lookaround assertions). This function is
  <i>workspace</i>    Points to a vector of ints used as working space
  <i>wscount</i>      Number of elements in the vector
 </pre>
-For <b>pcre2_dfa_match()</b>, a match context is needed only if you want to set
-up a callout function or specify the heap limit or the match or the recursion
-depth limits. The <i>length</i> and <i>startoffset</i> values are code units, not
-characters. The options are:
+The size of output vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
+data block is therefore not advisable when using this function.
+</P>
+<P>
+A match context is needed only if you want to set up a callout function or
+specify the heap limit or the match or the recursion depth limits. The
+<i>length</i> and <i>startoffset</i> values are code units, not characters. The
+options are:
 <pre>
  PCRE2_ANCHORED          Match only at the first position
  PCRE2_COPY_MATCHED_SUBJECT
--- a/doc/html/pcre2_get_match_data_size.html
+++ b/doc/html/pcre2_get_match_data_size.html
@ -0,0 +1,39 @@
+<html>
+<head>
+<title>pcre2_get_match_data_size specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2_get_match_data_size man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<br><b>
+SYNOPSIS
+</b><br>
+<P>
+<b>#include &#60;pcre2.h&#62;</b>
+</P>
+<P>
+<b>PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *<i>match_data</i>);</b>
+</P>
+<br><b>
+DESCRIPTION
+</b><br>
+<P>
+This function returns the size, in bytes, of the match data block that is its
+argument.
+</P>
+<P>
+There is a complete description of the PCRE2 native API in the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+page and a description of the POSIX API in the
+<a href="pcre2posix.html"><b>pcre2posix</b></a>
+page.
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
--- a/doc/html/pcre2_jit_compile.html
+++ b/doc/html/pcre2_jit_compile.html
@ -40,11 +40,17 @@ bits:
  PCRE2_JIT_COMPLETE      compile code for full matching
  PCRE2_JIT_PARTIAL_SOFT  compile code for soft partial matching
  PCRE2_JIT_PARTIAL_HARD  compile code for hard partial matching
-  PCRE2_JIT_INVALID_UTF   compile code to handle invalid UTF
 </pre>
+There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been
+superseded by the <b>pcre2_compile()</b> option PCRE2_MATCH_INVALID_UTF. The old
+option is deprecated and may be removed in the future.
+</P>
+<P>
 The yield of the function is 0 for success, or a negative error code otherwise.
 In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or
-if an unknown bit is set in <i>options</i>.
+if an unknown bit is set in <i>options</i>. The function can also return
+PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the
+compiler, even if it was because of a system security restriction.
 </P>
 <P>
 There is a complete description of the PCRE2 native API in the
--- a/doc/html/pcre2_jit_free_unused_memory.html
+++ b/doc/html/pcre2_jit_free_unused_memory.html
@ -29,7 +29,7 @@ This function frees unused JIT executable memory. The argument is a general
 context, for custom memory management, or NULL for standard memory management.
 JIT memory allocation retains some memory in order to improve future JIT
 compilation speed. In low memory conditions,
-\fBpcre2_jit_free_unused_memory()\fB can be used to cause this memory to be
+<b>pcre2_jit_free_unused_memory()</b> can be used to cause this memory to be
 freed.
 </P>
 <P>
--- a/doc/html/pcre2_jit_match.html
+++ b/doc/html/pcre2_jit_match.html
@ -33,7 +33,9 @@ processed by the JIT compiler against a given subject string, using a matching
 algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and
 it bypasses some of the sanity checks that <b>pcre2_match()</b> applies.
 Its arguments are exactly the same as for
-<a href="pcre2_match.html"><b>pcre2_match()</b>.</a>
+<a href="pcre2_match.html"><b>pcre2_match()</b>,</a>
+except that the subject string must be specified with a length;
+PCRE2_ZERO_TERMINATED is not supported.
 </P>
 <P>
 The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
--- a/doc/html/pcre2_jit_stack_create.html
+++ b/doc/html/pcre2_jit_stack_create.html
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
 <b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
 which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
 A maximum stack size of 512KiB to 1MiB should be more than enough for any
-pattern. For more details, see the
+pattern. If the stack couldn't be allocated or the values passed were not
+reasonable, NULL will be returned. For more details, see the
 <a href="pcre2jit.html"><b>pcre2jit</b></a>
 page.
 </P>
--- a/doc/html/pcre2_maketables.html
+++ b/doc/html/pcre2_maketables.html
@ -19,7 +19,7 @@ SYNOPSIS
 <b>#include &#60;pcre2.h&#62;</b>
 </P>
 <P>
-<b>const unsigned char *pcre2_maketables(pcre2_general_context *<i>gcontext</i>);</b>
+<b>const uint8_t *pcre2_maketables(pcre2_general_context *<i>gcontext</i>);</b>
 </P>
 <br><b>
 DESCRIPTION
--- a/doc/html/pcre2_maketables_free.html
+++ b/doc/html/pcre2_maketables_free.html
@ -0,0 +1,44 @@
+<html>
+<head>
+<title>pcre2_maketables_free specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2_maketables_free man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<br><b>
+SYNOPSIS
+</b><br>
+<P>
+<b>#include &#60;pcre2.h&#62;</b>
+</P>
+<P>
+<b>void pcre2_maketables_free(pcre2_general_context *<i>gcontext</i>,</b>
+<b>  const uint8_t *<i>tables</i>);</b>
+</P>
+<br><b>
+DESCRIPTION
+</b><br>
+<P>
+This function discards a set of character tables that were created by a call
+to
+<a href="pcre2_maketables.html"><b>pcre2_maketables()</b>.</a>
+</P>
+<P>
+The <i>gcontext</i> parameter should match what was used in that call to
+account for any custom allocators that might be in use; if it is NULL
+the system <b>free()</b> is used.
+</P>
+<P>
+There is a complete description of the PCRE2 native API in the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+page.
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
--- a/doc/html/pcre2_match_data_create.html
+++ b/doc/html/pcre2_match_data_create.html
@ -30,8 +30,9 @@ This function creates a new match data block, which is used for holding the
 result of a match. The first argument specifies the number of pairs of offsets
 that are required. These form the "output vector" (ovector) within the match
 data block, and are used to identify the matched string and any captured
-substrings. There is always one pair of offsets; if <b>ovecsize</b> is zero, it
-is treated as one.
+substrings when matching with <b>pcre2_match()</b>, or a number of different
+matches at the same point when used with <b>pcre2_dfa_match()</b>. There is
+always one pair of offsets; if <b>ovecsize</b> is zero, it is treated as one.
 </P>
 <P>
 The second argument points to a general context, for custom memory management,
--- a/doc/html/pcre2_match_data_create_from_pattern.html
+++ b/doc/html/pcre2_match_data_create_from_pattern.html
@ -26,12 +26,15 @@ SYNOPSIS
 DESCRIPTION
 </b><br>
 <P>
-This function creates a new match data block, which is used for holding the
-result of a match. The first argument points to a compiled pattern. The number
-of capturing parentheses within the pattern is used to compute the number of
-pairs of offsets that are required in the match data block. These form the
-"output vector" (ovector) within the match data block, and are used to identify
-the matched string and any captured substrings.
+This function creates a new match data block for holding the result of a match.
+The first argument points to a compiled pattern. The number of capturing
+parentheses within the pattern is used to compute the number of pairs of
+offsets that are required in the match data block. These form the "output
+vector" (ovector) within the match data block, and are used to identify the
+matched string and any captured substrings when matching with
+<b>pcre2_match()</b>. If you are using <b>pcre2_dfa_match()</b>, which uses the
+outut vector in a different way, you should use <b>pcre2_match_data_create()</b>
+instead of this function.
 </P>
 <P>
 The second argument points to a general context, for custom memory management,
--- a/doc/html/pcre2_serialize_decode.html
+++ b/doc/html/pcre2_serialize_decode.html
@ -48,7 +48,7 @@ the following negative error codes:
  PCRE2_ERROR_BADDATA   <i>number_of_codes</i> is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in <i>bytes</i>
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      <i>codes</i> or <i>bytes</i> is NULL
 </pre>
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
--- a/doc/html/pcre2_set_character_tables.html
+++ b/doc/html/pcre2_set_character_tables.html
@ -20,16 +20,19 @@ SYNOPSIS
 </P>
 <P>
 <b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
-<b>  const unsigned char *<i>tables</i>);</b>
+<b>  const uint8_t *<i>tables</i>);</b>
 </P>
 <br><b>
 DESCRIPTION
 </b><br>
 <P>
 This function sets a pointer to custom character tables within a compile
-context. The second argument must be the result of a call to
-<b>pcre2_maketables()</b> or NULL to request the default tables. The result is
-always zero.
+context. The second argument must point to a set of PCRE2 character tables or
+be NULL to request the default tables. The result is always zero. Character
+tables can be created by calling <b>pcre2_maketables()</b> or by running the
+<b>pcre2_dftables</b> maintenance command in binary mode (see the
+<a href="pcre2build.html"><b>pcre2build</b></a>
+documentation).
 </P>
 <P>
 There is a complete description of the PCRE2 native API in the
--- a/doc/html/pcre2_set_compile_extra_options.html
+++ b/doc/html/pcre2_set_compile_extra_options.html
@ -30,7 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
 housed in a compile context. It completely replaces all the bits. The extra
 options are:
 <pre>
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{df800} to \x{dfff} in UTF-8 and UTF-32 modes
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \K in lookarounds
+  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \u, \U, and \x handling
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as a literal following character
  PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \r as \n
--- a/doc/html/pcre2_substitute.html
+++ b/doc/html/pcre2_substitute.html
@ -48,8 +48,8 @@ Its arguments are:
  <i>outlengthptr</i>  Points to the length of the output buffer
 </pre>
 A match data block is needed only if you want to inspect the data from the
-match that is returned in that block. A match context is needed only if you
-want to:
+final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is
+set. A match context is needed only if you want to:
 <pre>
  Set up a callout function
  Set a matching offset limit
@ -57,29 +57,46 @@ want to:
  Change the backtracking depth limit
  Set custom memory management in the match context
 </pre>
-The <i>length</i>, <i>startoffset</i> and <i>rlength</i> values are code
-units, not characters, as is the contents of the variable pointed at by
-<i>outlengthptr</i>, which is updated to the actual length of the new string.
+The <i>length</i>, <i>startoffset</i> and <i>rlength</i> values are code units,
+not characters, as is the contents of the variable pointed at by
+<i>outlengthptr</i>. This variable must contain the length of the output buffer
+when the function is called. If the function is successful, the value is
+changed to the length of the new string, excluding the trailing zero that is
+automatically added.
+</P>
+<P>
 The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
 zero-terminated strings. The options are:
 <pre>
  PCRE2_ANCHORED                     Match only at the first position
-  PCRE2_ENDANCHORED          Pattern can match only at end of subject
+  PCRE2_ENDANCHORED                  Match only at end of subject
  PCRE2_NOTBOL                       Subject is not the beginning of a line
  PCRE2_NOTEOL                       Subject is not the end of a line
  PCRE2_NOTEMPTY                     An empty string is not a valid match
  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of the subject is not a valid match
  PCRE2_NO_JIT                       Do not use JIT matching
-  PCRE2_NO_UTF_CHECK         Do not check the subject or replacement for UTF validity (only relevant if
-                              PCRE2_UTF was set at compile time)
+  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in the subject or replacement
+                                      (only relevant if PCRE2_UTF was set at compile time)
  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the subject
+  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
+  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for first match
  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
+  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
 </pre>
+If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
+PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
+</P>
+<P>
+If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
+contents must be the result of a call to <b>pcre2_match()</b> using the same
+pattern and subject.
+</P>
+<P>
 The function returns the number of substitutions, which may be zero if there
-were no matches. The result can be greater than one only when
+are no matches. The result may be greater than one only when
 PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
 is returned.
 </P>
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
--- a/doc/html/pcre2build.html
+++ b/doc/html/pcre2build.html
@ -128,7 +128,7 @@ To build it without Unicode support, add
  --disable-unicode
 </pre>
 to the <b>configure</b> command. This setting applies to all three libraries. It
-is not possible to build one library with Unicode support, and another without,
+is not possible to build one library with Unicode support and another without
 in the same configuration.
 </P>
 <P>
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
 UTF support allows the libraries to process character code points up to
 0x10ffff in the strings that they handle. Unicode support also gives access to
 the Unicode properties of characters, using pattern escapes such as \P, \p,
-and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
-supported. Details are given in the
+and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
+script names, and some bi-directional properties are supported. Details are
+given in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 documentation.
 </P>
@ -188,11 +189,11 @@ which enables the use of an execmem allocator in JIT that is compatible with
 SELinux. This has no effect if JIT is not enabled. See the
 <a href="pcre2jit.html"><b>pcre2jit</b></a>
 documentation for a discussion of JIT usage. When JIT support is enabled,
-pcre2grep automatically makes use of it, unless you add
+<b>pcre2grep</b> automatically makes use of it, unless you add
 <pre>
  --disable-pcre2grep-jit
 </pre>
-to the "configure" command.
+to the <b>configure</b> command.
 </P>
 <br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
 <P>
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
 counting is done differently).
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The <b>pcre2_match()</b> function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 <a href="pcre2api.html"><b>pcre2api</b></a>
 documentation. The default limit (in effect unlimited) is 20 million. You can
 change this by a setting such as
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
 for --with-match-limit. You can set a lower default limit by adding, for
 example,
 <pre>
-  --with-match-limit_depth=10000
+  --with-match-limit-depth=10000
 </pre>
 to the <b>configure</b> command. This value can be overridden at run time. This
 depth limit indirectly limits the amount of heap memory that is used, but
@ -321,7 +321,7 @@ As well as applying to <b>pcre2_match()</b>, the depth limit also controls
 the depth of recursive function calls in <b>pcre2_dfa_match()</b>. These are
 used for lookaround assertions, atomic groups, and recursion within patterns.
 The limit does not apply to JIT matching.
-</P>
+<a name="createtables"></a></P>
 <br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
 <P>
 PCRE2 uses fixed tables for processing characters whose code points are less
@ -332,12 +332,34 @@ only. If you add
  --enable-rebuild-chartables
 </pre>
 to the <b>configure</b> command, the distributed tables are no longer used.
-Instead, a program called <b>dftables</b> is compiled and run. This outputs the
-source for new set of tables, created in the default locale of your C run-time
-system. This method of replacing the tables does not work if you are cross
-compiling, because <b>dftables</b> is run on the local host. If you need to
-create alternative tables when cross compiling, you will have to do so "by
-hand".
+Instead, a program called <b>pcre2_dftables</b> is compiled and run. This
+outputs the source for new set of tables, created in the default locale of your
+C run-time system. This method of replacing the tables does not work if you are
+cross compiling, because <b>pcre2_dftables</b> needs to be run on the local
+host and therefore not compiled with the cross compiler.
+</P>
+<P>
+If you need to create alternative tables when cross compiling, you will have to
+do so "by hand". There may also be other reasons for creating tables manually.
+To cause <b>pcre2_dftables</b> to be built on the local host, run a normal
+compiling command, and then run the program with the output file as its
+argument, for example:
+<pre>
+  cc src/pcre2_dftables.c -o pcre2_dftables
+  ./pcre2_dftables src/pcre2_chartables.c
+</pre>
+This builds the tables in the default locale of the local host. If you want to
+specify a locale, you must use the -L option:
+<pre>
+  LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
+</pre>
+You can also specify -b (with or without -L). This causes the tables to be
+written in binary instead of as source code. A set of binary tables can be
+loaded into memory by an application and passed to <b>pcre2_compile()</b> in the
+same way as tables created by calling <b>pcre2_maketables()</b>. The tables are
+just a string of bytes, independent of hardware characteristics such as
+endianness. This means they can be bundled with an application that runs in
+different environments, to ensure consistent behaviour.
 </P>
 <br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
 <P>
@ -414,7 +436,7 @@ default parameter values by adding, for example,
  --with-pcre2grep-bufsize=51200
  --with-pcre2grep-max-bufsize=2097152
 </pre>
-to the <b>configure</b> command. The caller of \fPpcre2grep\fP can override
+to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override
 these values by using --buffer-size and --max-buffer-size on the command line.
 </P>
 <br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
@ -531,15 +553,16 @@ documentation.
 <P>
 The C99 standard defines formatting modifiers z and t for size_t and
 ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-defined and has a value greater than or equal to 199901L (indicating C99).
+environments other than old versions of Microsoft Visual Studio when
+__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
+(indicating support for C99).
 However, there is at least one environment that claims to be C99 but does not
 support these modifiers. If
 <pre>
  --disable-percent-zt
 </pre>
-is specified, no use is made of the z or t modifiers. Instead or %td or %zu,
-%lu is used, with a cast for size_t values.
+is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
+a suitable format is used depending in the size of long for the platform.
 </P>
 <br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
 <P>
@ -585,16 +608,16 @@ give a warning.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC26" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 03 March 2019
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2compat.html
+++ b/doc/html/pcre2compat.html
@ -16,32 +16,43 @@ please consult the man page, in case the conversion went wrong.
 DIFFERENCES BETWEEN PCRE2 AND PERL
 </b><br>
 <P>
-This document describes the differences in the ways that PCRE2 and Perl handle
-regular expressions. The differences described here are with respect to Perl
-versions 5.26, but as both Perl and PCRE2 are continually changing, the
-information may sometimes be out of date.
+This document describes some of the differences in the ways that PCRE2 and Perl
+handle regular expressions. The differences described here are with respect to
+Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
+information may at times be out of date.
 </P>
 <P>
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
+1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
+behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
+next character unless it is the start of a newline sequence. This means that,
+if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
+(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
+EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
+indicator.
+</P>
+<P>
+2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
 have are given in the
 <a href="pcre2unicode.html"><b>pcre2unicode</b></a>
 page.
 </P>
 <P>
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
+3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
 they do not mean what you might think. For example, (?!a){3} does not assert
 that the next three characters are not "a". It just asserts that the next
 character is not "a" three times (in principle; PCRE2 optimizes this to run the
 assertion just once). Perl allows some repeat quantifiers on other assertions,
-for example, \b* (but not \b{3}), but these do not seem to have any use.
+for example, \b* , but these do not seem to have any use. PCRE2 does not allow
+any kind of quantifier on non-lookaround assertions.
 </P>
 <P>
-3. Capture groups that occur inside negative lookaround assertions are counted,
+4. Capture groups that occur inside negative lookaround assertions are counted,
 but their entries in the offsets vector are set only when a negative assertion
 is a condition that has a matching branch (that is, the condition is false).
+Perl may set such capture groups in other circumstances.
 </P>
 <P>
-4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
+5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
 \U, and \N when followed by a character name. \N on its own, matching a
 non-newline character, and \N{U+dd..}, matching a Unicode code point, are
 supported. The escapes that modify the case of following letters are
@ -52,24 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
 interprets them.
 </P>
 <P>
-5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
+6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
 built with Unicode support (the default). The properties that can be tested
 with \p and \P are limited to the general category properties such as Lu and
-Nd, script names such as Greek or Han, and the derived properties Any and L&.
-PCRE2 does support the Cs (surrogate) property, which Perl does not; the Perl
-documentation says "Because Perl hides the need for the user to understand the
-internal representation of Unicode characters, there is no need to implement
-the somewhat messy concept of surrogates."
+Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
+derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
+(surrogate) property, but in PCRE2 its use is limited. See the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation for details. The long synonyms for property names that Perl
+supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
+to prefix any of these properties with "Is".
 </P>
 <P>
-6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
+7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
 in between are treated as literals. However, this is slightly different from
 Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
-they cause variable interpolation (but of course PCRE2 does not have
-variables). Also, Perl does "double-quotish backslash interpolation" on any
-backslashes between \Q and \E which, its documentation says, "may lead to
-confusing results". PCRE2 treats a backslash between \Q and \E just like any
-other character. Note the following examples:
+they cause variable interpolation (PCRE2 does not have variables). Also, Perl
+does "double-quotish backslash interpolation" on any backslashes between \Q
+and \E which, its documentation says, "may lead to confusing results". PCRE2
+treats a backslash between \Q and \E just like any other character. Note the
+following examples:
 <pre>
    Pattern            PCRE2 matches     Perl matches

@ -79,41 +92,38 @@ other character. Note the following examples:
    \QA\B\E            A\B               A\B
    \Q\\E              \                 \\E
 </pre>
-The \Q...\E sequence is recognized both inside and outside character classes.
+The \Q...\E sequence is recognized both inside and outside character classes
+by both PCRE2 and Perl.
 </P>
 <P>
-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
+8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
 constructions. However, PCRE2 does have a "callout" feature, which allows an
 external function to be called during pattern matching. See the
 <a href="pcre2callout.html"><b>pcre2callout</b></a>
 documentation for details.
 </P>
 <P>
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up
+9. Subroutine calls (whether recursive or not) were treated as atomic groups up
 to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
 into subroutine calls is now supported, as in Perl.
 </P>
 <P>
-9. If any of the backtracking control verbs are used in a group that is called
-as a subroutine (whether or not recursively), their effect is confined to that
-group; it does not extend to the surrounding pattern. This is not always the
-case in Perl. In particular, if (*THEN) is present in a group that is called as
-a subroutine, its action is limited to that group, even if the group does not
-contain any | characters. Note that such groups are processed as anchored
-at the point where they are tested.
+10. In PCRE2, if any of the backtracking control verbs are used in a group that
+is called as a subroutine (whether or not recursively), their effect is
+confined to that group; it does not extend to the surrounding pattern. This is
+not always the case in Perl. In particular, if (*THEN) is present in a group
+that is called as a subroutine, its action is limited to that group, even if
+the group does not contain any | characters. Note that such groups are
+processed as anchored at the point where they are tested.
 </P>
 <P>
-10. If a pattern contains more than one backtracking control verb, the first
+11. If a pattern contains more than one backtracking control verb, the first
 one that is backtracked onto acts. For example, in the pattern
 A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
 triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
 same as PCRE2, but there are cases where it differs.
 </P>
 <P>
-11. Most backtracking verbs in assertions have their normal actions. They are
-not confined to the assertion.
-</P>
-<P>
 12. There are some differences that are concerned with the settings of captured
 strings when part of a pattern is repeated. For example, matching "aba" against
 the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
@ -123,7 +133,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
 13. PCRE2's handling of duplicate capture group numbers and names is not as
 general as Perl's. This is a consequence of the fact the PCRE2 works internally
 just with numbers, using an external table to translate between numbers and
-names. In particular, a pattern such as (?|(?&#60;a&#62;A)|(?&#60;b&#62;B), where the two
+names. In particular, a pattern such as (?|(?&#60;a&#62;A)|(?&#60;b&#62;B)), where the two
 capture groups have the same number but different names, is not supported, and
 causes an error at compile time. If it were allowed, it would not be possible
 to distinguish which group matched, because both names map to capture group
@ -146,19 +156,27 @@ certainly user mistakes.
 16. In PCRE2, the upper/lower case character properties Lu and Ll are not
 affected when case-independent matching is specified. For example, \p{Lu}
 always matches an upper case letter. I think Perl has changed in this respect;
-in the release at the time of writing (5.24), \p{Lu} and \p{Ll} match all
+in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
 letters, regardless of case, when case independence is specified.
 </P>
 <P>
-17. PCRE2 provides some extensions to the Perl regular expression facilities.
-Perl 5.10 includes new features that are not in earlier versions of Perl, some
+17. From release 5.32.0, Perl locks out the use of \K in lookaround
+assertions. From release 10.38 PCRE2 does the same by default. However, there
+is an option for re-enabling the previous behaviour. When this option is set,
+\K is acted on when it occurs in positive assertions, but is ignored in
+negative assertions.
+</P>
+<P>
+18. PCRE2 provides some extensions to the Perl regular expression facilities.
+Perl 5.10 included new features that were not in earlier versions of Perl, some
 of which (such as named parentheses) were in PCRE2 for some time before. This
-list is with respect to Perl 5.26:
+list is with respect to Perl 5.34:
 <br>
 <br>
 (a) Although lookbehind assertions in PCRE2 must match fixed length strings,
-each alternative branch of a lookbehind assertion can match a different length
-of string. Perl requires them all to have the same length.
+each alternative toplevel branch of a lookbehind assertion can match a
+different length of string. Perl used to require them all to have the same
+length, but the latest version has some variable length support.
 <br>
 <br>
 (b) From PCRE2 10.23, backreferences to groups of fixed length are supported
@ -203,16 +221,21 @@ different way and is not Perl-compatible.
 <br>
 <br>
 (l) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
-the start of a pattern that set overall options that cannot be changed within
+the start of a pattern. These set overall options that cannot be changed within
 the pattern.
+<br>
+<br>
+(m) PCRE2 supports non-atomic positive lookaround assertions. This is an
+extension to the lookaround facilities. The default, Perl-compatible
+lookarounds are atomic.
 </P>
 <P>
-18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
+19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
 modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
 rules. This separation cannot be represented with PCRE2_UCP.
 </P>
 <P>
-19. Perl has different limits than PCRE2. See the
+20. Perl has different limits than PCRE2. See the
 <a href="pcre2limit.html"><b>pcre2limit</b></a>
 documentation for details. Perl went with 5.10 from recursion to iteration
 keeping the intermediate matches on the heap, which is ~10% slower but does not
@ -225,7 +248,7 @@ AUTHOR
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -234,9 +257,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 12 February 2019
+Last updated: 08 December 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2convert.html
+++ b/doc/html/pcre2convert.html
@ -141,8 +141,8 @@ permitted to match separator characters, but the double-star (**) feature
 </P>
 <P>
 PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
-match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
-double-star feature disabled. These options may be given together.
+match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
+the double-star feature disabled. These options may be given together.
 </P>
 <br><a name="SEC5" href="#TOC1">CONVERTING POSIX PATTERNS</a><br>
 <P>
--- a/doc/html/pcre2demo.html
+++ b/doc/html/pcre2demo.html
@ -104,12 +104,11 @@ uint32_t newline;

 PCRE2_SIZE erroroffset;
 PCRE2_SIZE *ovector;
+PCRE2_SIZE subject_length;

-size_t subject_length;
 pcre2_match_data *match_data;


-
 /**************************************************************************
 * First, sort out the command line. There is only one possible option at  *
 * the moment, "-g" to request repeated matching to find all occurrences,  *
@ -138,12 +137,14 @@ if (argc - i != 2)
  return 1;
  }

-/* As pattern and subject are char arguments, they can be straightforwardly
-cast to PCRE2_SPTR as we are working in 8-bit code units. */
+/* Pattern and subject are char arguments, so they can be straightforwardly
+cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
+length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
+defined to be size_t. */

 pattern = (PCRE2_SPTR)argv[i];
 subject = (PCRE2_SPTR)argv[i+1];
-subject_length = strlen((char *)subject);
+subject_length = (PCRE2_SIZE)strlen((char *)subject);


 /*************************************************************************
@ -172,17 +173,22 @@ if (re == NULL)


 /*************************************************************************
-* If the compilation succeeded, we call PCRE again, in order to do a     *
+* If the compilation succeeded, we call PCRE2 again, in order to do a    *
 * pattern match against the subject string. This does just ONE match. If *
 * further matching is needed, it will be done below. Before running the  *
-* match we must set up a match_data block for holding the result.        *
+* match we must set up a match_data block for holding the result. Using  *
+* pcre2_match_data_create_from_pattern() ensures that the block is       *
+* exactly the right size for the number of capturing parentheses in the  *
+* pattern. If you need to know the actual size of a match_data block as  *
+* a number of bytes, you can find it like this:                          *
+*                                                                        *
+* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data);    *
 *************************************************************************/

-/* Using this function ensures that the block is exactly the right size for
-the number of capturing parentheses in the pattern. */
-
 match_data = pcre2_match_data_create_from_pattern(re, NULL);

+/* Now run the match. */
+
 rc = pcre2_match(
  re,                   /* the compiled pattern */
  subject,              /* the subject string */
@ -209,8 +215,8 @@ if (rc &lt; 0)
  return 1;
  }

-/* Match succeded. Get a pointer to the output vector, where string offsets are
-stored. */
+/* Match succeeded. Get a pointer to the output vector, where string offsets
+are stored. */

 ovector = pcre2_get_ovector_pointer(match_data);
 printf("Match succeeded at offset %d\n", (int)ovector[0]);
@ -228,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
 if (rc == 0)
  printf("ovector was not big enough for all the captured substrings\n");

-/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
-to set the start of a match later than its end. In this demonstration program,
-we just detect this case and give up. */
+/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
+assertions. However, there is an option to re-enable the old behaviour. If that
+is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
+assertion to set the start of a match later than its end. In this demonstration
+program, we show how to detect this case, but it shouldn't arise because the
+option is never set. */

 if (ovector[0] &gt; ovector[1])
  {
@ -249,7 +258,7 @@ application you might want to do things other than print them. */
 for (i = 0; i &lt; rc; i++)
  {
  PCRE2_SPTR substring_start = subject + ovector[2*i];
-  size_t substring_length = ovector[2*i+1] - ovector[2*i];
+  PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
  printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
  }

@ -447,7 +456,7 @@ for (;;)
    return 1;
    }

-  /* Match succeded */
+  /* Match succeeded */

  printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);

--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@ -71,13 +71,15 @@ For example:
 <pre>
  pcre2grep some-pattern file1 - file3
 </pre>
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how <b>pcre2grep</b> behaves. In
-particular, the <b>-M</b> option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-<b>-N</b> (<b>--newline</b>) option.
+However, there are options that can change how <b>pcre2grep</b> behaves. For
+example, the <b>-M</b> option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the <b>-N</b>
+(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
+not file names are shown, and the <b>-Z</b> option changes the file name
+terminator to a zero byte.
 </P>
 <P>
 The amount of memory used for buffering files that are being scanned is
@ -111,8 +113,8 @@ matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
 (either shown literally, or as an offset), scanning resumes immediately
 following the match, so that further matches on the same line can be found. If
 there are multiple patterns, they are all tried on the remainder of the line,
-but patterns that follow the one that matched are not tried on the earlier part
-of the line.
+but patterns that follow the one that matched are not tried on the earlier
+matched part of the line.
 </P>
 <P>
 This behaviour means that the order in which multiple patterns are specified
@ -146,11 +148,10 @@ ignored.
 <br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
 <P>
 By default, a file that contains a binary zero byte within the first 1024 bytes
-is identified as a binary file, and is processed specially. (GNU grep
-identifies binary files in this manner.) However, if the newline type is
-specified as "nul", that is, the line terminator is a binary zero, the test for
-a binary file is not applied. See the <b>--binary-files</b> option for a means
-of changing the way binary files are handled.
+is identified as a binary file, and is processed specially. However, if the
+newline type is specified as NUL, that is, the line terminator is a binary
+zero, the test for a binary file is not applied. See the <b>--binary-files</b>
+option for a means of changing the way binary files are handled.
 </P>
 <br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
 <P>
@ -179,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of <i>number</i>
-is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
+context lines (the <b>-Z</b> option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
+<b>-A</b> is ignored.
 </P>
 <P>
 <b>-a</b>, <b>--text</b>
@ -189,14 +192,21 @@ Treat binary files as text. This is equivalent to
 <b>--binary-files</b>=<i>text</i>.
 </P>
 <P>
+<b>--allow-lookaround-bsk</b>
+PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl.
+This option causes <b>pcre2grep</b> to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option, which enables this somewhat dangerous usage.
+</P>
+<P>
 <b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
 Output up to <i>number</i> lines of context before each matching line. Fewer
 lines are output if the previous match or the start of the file is within
 <i>number</i> lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of <i>number</i> is expected to be relatively small. When
+instead of a colon for the context lines (the <b>-Z</b> option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of <i>number</i> is expected to be relatively small. When
 <b>-c</b> is used, <b>-B</b> is ignored.
 </P>
 <P>
@ -406,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
 <P>
 <b>-H</b>, <b>--with-filename</b>
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the <b>-M</b> option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the <b>-M</b> option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
 </P>
 <P>
 <b>-h</b>, <b>--no-filename</b>
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
+Suppress the output file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The <b>-Z</b> option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>--heap-limit</b>=<i>number</i>
@ -443,8 +455,8 @@ Ignore upper/lower case distinctions during comparisons.
 <P>
 <b>--include</b>=<i>pattern</i>
 If any <b>--include</b> patterns are specified, the only files that are
-processed are those that match one of the patterns (and do not match an
-<b>--exclude</b> pattern). This option does not affect directories, but it
+processed are those whose names match one of the patterns and do not match an
+<b>--exclude</b> pattern. This option does not affect directories, but it
 applies to all files, whether listed on the command line, obtained from
 <b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
 expression, and is matched against the final component of the file name, not
@ -463,8 +475,8 @@ may be given any number of times; all the files are read.
 <P>
 <b>--include-dir</b>=<i>pattern</i>
 If any <b>--include-dir</b> patterns are specified, the only directories that
-are processed are those that match one of the patterns (and do not match an
-<b>--exclude-dir</b> pattern). This applies to all directories, whether listed
+are processed are those whose names match one of the patterns and do not match
+an <b>--exclude-dir</b> pattern. This applies to all directories, whether listed
 on the command line, obtained from <b>--file-list</b>, or by scanning a parent
 directory. The pattern is a PCRE2 regular expression, and is matched against
 the final component of the directory name, not the entire path. The <b>-F</b>,
@ -476,19 +488,22 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
 <b>-L</b>, <b>--files-without-match</b>
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous <b>-H</b>,
-<b>-h</b>, or <b>-l</b> options.
+output once, on a separate line by default, but if the <b>-Z</b> option is set, 
+they are separated by zero bytes instead of newlines. This option overrides any
+previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>-l</b>, <b>--files-with-matches</b>
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the <b>-c</b> (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-<b>-c</b> is a way of suppressing the listing of files with no matches. This
-opeion overrides any previous <b>-H</b>, <b>-h</b>, or <b>-L</b> options.
+a separate line, but if the <b>-Z</b> option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the <b>-c</b> (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with <b>-c</b> is a way of suppressing the listing of files with no matches that
+occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
+<b>-h</b>, or <b>-L</b> options.
 </P>
 <P>
 <b>--label</b>=<i>name</i>
@ -501,8 +516,8 @@ short form for this option.
 When this option is given, non-compressed input is read and processed line by
 line, and the output is flushed after each write. By default, input is read in
 large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
-terminal (which is currently possible only in Unix-like environments or
-Windows). Output to terminal is normally automatically flushed by the operating
+terminal, which is currently possible only in Unix-like environments or
+Windows. Output to terminal is normally automatically flushed by the operating
 system. This option can be useful when the input or output is attached to a
 pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
 However, its use will affect performance, and the <b>-M</b> (multiline) option
@ -528,46 +543,6 @@ locale is specified, the PCRE2 library's default (usually the "C" locale) is
 used. There is no short form for this option.
 </P>
 <P>
-<b>--match-limit</b>=<i>number</i>
-Processing some regular expression patterns may take a very long time to search
-for all possible matching strings. Others may require a very large amount of
-memory. There are three options that set resource limits for matching.
-<br>
-<br>
-The <b>--match-limit</b> option provides a means of limiting computing resource
-usage when processing patterns that are not going to match, but which have a
-very large number of possibilities in their search trees. The classic example
-is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
-counter that is incremented each time around its main processing loop. If the
-value set by <b>--match-limit</b> is reached, an error occurs.
-<br>
-<br>
-The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
-<br>
-<br>
-The <b>--depth-limit</b> option limits the depth of nested backtracking points,
-which indirectly limits the amount of memory that is used. The amount of memory
-needed for each backtracking point depends on the number of capturing
-parentheses in the pattern, so the amount of memory that is used before this
-limit acts varies from pattern to pattern. This limit is of use only if it is
-set smaller than <b>--match-limit</b>.
-<br>
-<br>
-There are no short forms for these options. The default limits can be set
-when the PCRE2 library is compiled; if they are not specified, the defaults
-are very large and so effectively unlimited.
-</P>
-<P>
-\fB--max-buffer-size=<i>number</i>
-This limits the expansion of the processing buffer, whose initial size can be
-set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
-smaller than the starting buffer size.
-</P>
-<P>
 <b>-M</b>, <b>--multiline</b>
 Allow patterns to match more than one line. When this option is set, the PCRE2
 library is called in "multiline" mode. This allows a matched string to extend
@ -597,29 +572,84 @@ well as possibly handling a two-character newline sequence.
 There is a limit to the number of lines that can be matched, imposed by the way
 that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
 large processing buffer, this should not be a problem, but the <b>-M</b> option
-does not work when input is read line by line (see \fP--line-buffered\fP.)
+does not work when input is read line by line (see <b>--line-buffered</b>.)
+</P>
+<P>
+<b>-m</b> <i>number</i>, <b>--max-count</b>=<i>number</i>
+Stop processing after finding <i>number</i> matching lines, or non-matching
+lines if <b>-v</b> is also set. Any trailing context lines are output after the
+final match. In multiline mode, each multiline match counts as just one line
+for this purpose. If this limit is reached when reading the standard input from
+a regular file, the file is left positioned just after the last matching line.
+If <b>-c</b> is also set, the count that is output is never greater than
+<i>number</i>. This option has no effect if used with <b>-L</b>, <b>-l</b>, or
+<b>-q</b>, or when just checking for a match in a binary file.
+</P>
+<P>
+<b>--match-limit</b>=<i>number</i>
+Processing some regular expression patterns may take a very long time to search
+for all possible matching strings. Others may require a very large amount of
+memory. There are three options that set resource limits for matching.
+<br>
+<br>
+The <b>--match-limit</b> option provides a means of limiting computing resource
+usage when processing patterns that are not going to match, but which have a
+very large number of possibilities in their search trees. The classic example
+is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
+counter that is incremented each time around its main processing loop. If the
+value set by <b>--match-limit</b> is reached, an error occurs.
+<br>
+<br>
+The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
+1024 bytes), the maximum amount of heap memory that may be used for matching.
+<br>
+<br>
+The <b>--depth-limit</b> option limits the depth of nested backtracking points,
+which indirectly limits the amount of memory that is used. The amount of memory
+needed for each backtracking point depends on the number of capturing
+parentheses in the pattern, so the amount of memory that is used before this
+limit acts varies from pattern to pattern. This limit is of use only if it is
+set smaller than <b>--match-limit</b>.
+<br>
+<br>
+There are no short forms for these options. The default limits can be set
+when the PCRE2 library is compiled; if they are not specified, the defaults
+are very large and so effectively unlimited.
+</P>
+<P>
+<b>--max-buffer-size</b>=<i>number</i>
+This limits the expansion of the processing buffer, whose initial size can be
+set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
+smaller than the starting buffer size.
 </P>
 <P>
 <b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
-The PCRE2 library supports five different conventions for indicating
-the ends of lines. They are the single-character sequences CR (carriage return)
-and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
-which recognizes any of the preceding three types, and an "any" convention, in
-which any Unicode line ending sequence is assumed to end a line. The Unicode
-sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
-(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
-PS (paragraph separator, U+2029).
+Six different conventions for indicating the ends of lines in scanned files are
+supported. For example:
+<pre>
+  pcre2grep -N CRLF 'some pattern' &#60;file&#62;
+</pre>
+The newline type may be specified in upper, lower, or mixed case. If the
+newline type is NUL, lines are separated by binary zero characters. The other
+types are the single-character sequences CR (carriage return) and LF
+(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
+recognizes any of the preceding three types, and an "any" type, for which any
+Unicode line ending sequence is assumed to end a line. The Unicode sequences
+are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
+U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+(paragraph separator, U+2029).
 <br>
 <br>
 When the PCRE2 library is built, a default line-ending sequence is specified.
 This is normally the standard sequence for the operating system. Unless
 otherwise specified by this option, <b>pcre2grep</b> uses the library's default.
-The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
-makes it possible to use <b>pcre2grep</b> to scan files that have come from
-other environments without having to modify their line endings. If the data
-that is being scanned does not agree with the convention set by this option,
-<b>pcre2grep</b> may behave in strange ways. Note that this option does not
-apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
+<br>
+<br>
+This option makes it possible to use <b>pcre2grep</b> to scan files that have
+come from other environments without having to modify their line endings. If
+the data that is being scanned does not agree with the convention set by this
+option, <b>pcre2grep</b> may behave in strange ways. Note that this option does
+not apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
 <b>--include-from</b> options, which are expected to use the operating system's
 standard newline sequence.
 </P>
@ -641,29 +671,41 @@ It should never be needed in normal use.
 </P>
 <P>
 <b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
-When there is a match, instead of outputting the whole line that matched,
-output just the given text. This option is mutually exclusive with
-<b>--only-matching</b>, <b>--file-offsets</b>, and <b>--line-offsets</b>. Escape
-sequences starting with a dollar character may be used to insert the contents
-of the matched part of the line and/or captured substrings into the text.
+When there is a match, instead of outputting the line that matched, output just
+the text specified in this option, followed by an operating-system standard
+newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
+and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
+this option, which is mutually exclusive with <b>--only-matching</b>,
+<b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
+<b>--only-matching</b>, if there is more than one match in a line, each of them
+causes a line of output.
 <br>
 <br>
-$&#60;digits&#62; or ${&#60;digits&#62;} is replaced by the captured
-substring of the given decimal number; zero substitutes the whole match. If
-the number is greater than the number of capturing substrings, or if the
-capture is unset, the replacement is empty.
+Escape sequences starting with a dollar character may be used to insert the
+contents of the matched part of the line and/or captured substrings into the
+text.
+<br>
+<br>
+$&#60;digits&#62; or ${&#60;digits&#62;} is replaced by the captured substring of the given
+decimal number; zero substitutes the whole match. If the number is greater than
+the number of capturing substrings, or if the capture is unset, the replacement
+is empty.
 <br>
 <br>
 $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
 newline; $r by carriage return; $t by tab; $v by vertical tab.
 <br>
 <br>
-$o&#60;digits&#62; is replaced by the character represented by the given octal
-number; up to three digits are processed.
+$o&#60;digits&#62; or $o{&#60;digits&#62;} is replaced by the character whose code point is the
+given octal number. In the first form, up to three octal digits are processed.
+When more digits are needed in Unicode mode to specify a wide character, the
+second form must be used.
 <br>
 <br>
-$x&#60;digits&#62; is replaced by the character represented by the given hexadecimal
-number; up to two digits are processed.
+$x&#60;digits&#62; or $x{&#60;digits&#62;} is replaced by the character represented by the
+given hexadecimal number. In the first form, up to two hexadecimal digits are
+processed. When more digits are needed in Unicode mode to specify a wide
+character, the second form must be used.
 <br>
 <br>
 Any other character is substituted by itself. In particular, $$ is replaced by
@ -685,20 +727,32 @@ otherwise empty line. This option is mutually exclusive with <b>--output</b>,
 <P>
 <b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
 Show only the part of the line that matched the capturing parentheses of the
-given number. Up to 32 capturing parentheses are supported, and -o0 is
-equivalent to <b>-o</b> without a number. Because these options can be given
-without an argument (see above), if an argument is present, it must be given in
-the same shell item, for example, -o3 or --only-matching=2. The comments given
-for the non-argument case above also apply to this option. If the specified
-capturing parentheses do not exist in the pattern, or were not set in the
-match, nothing is output unless the file name or line number are being output.
+given number. Up to 50 capturing parentheses are supported by default. This
+limit can be changed via the <b>--om-capture</b> option. A pattern may contain
+any number of capturing parentheses, but only those whose number is within the
+limit can be accessed by <b>-o</b>. An error occurs if the number specified by
+<b>-o</b> is greater than the limit.
+<br>
+<br>
+-o0 is the same as <b>-o</b> without a number. Because these options can be
+given without an argument (see above), if an argument is present, it must be
+given in the same shell item, for example, -o3 or --only-matching=2. The
+comments given for the non-argument case above also apply to this option. If
+the specified capturing parentheses do not exist in the pattern, or were not
+set in the match, nothing is output unless the file name or line number are
+being output.
 <br>
 <br>
 If this option is given multiple times, multiple substrings are output for each
 match, in the order the options are given, and all on one line. For example,
 -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
 then 3 again to be output. By default, there is no separator (but see the next
-option).
+but one option).
+</P>
+<P>
+<b>--om-capture</b>=<i>number</i>
+Set the number of capturing parentheses that can be accessed by <b>-o</b>. The
+default is 50.
 </P>
 <P>
 <b>--om-separator</b>=<i>text</i>
@ -720,7 +774,8 @@ option to "recurse".
 </P>
 <P>
 <b>--recursion-limit</b>=<i>number</i>
-See <b>--match-limit</b> above.
+This is an obsolete synonym for <b>--depth-limit</b>. See <b>--match-limit</b>
+above for details.
 </P>
 <P>
 <b>-s</b>, <b>--no-messages</b>
@ -741,11 +796,23 @@ ignored when used with <b>-L</b> (list files without matches), because the grand
 total would always be zero.
 </P>
 <P>
-<b>-u</b>, <b>--utf-8</b>
+<b>-u</b>, <b>--utf</b>
 Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
 with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
-<b>--include</b> options) and all subject lines that are scanned must be valid
-strings of UTF-8 characters.
+<b>--include</b> options) and all lines that are scanned must be valid strings
+of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
+occurs.
+</P>
+<P>
+<b>-U</b>, <b>--utf-allow-invalid</b>
+As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
+unit sequences. These can never form part of any pattern match. Patterns
+themselves, however, must still be valid UTF-8 strings. This facility allows
+valid UTF-8 strings to be sought within arbitrary byte sequences in executable
+or other binary files. For more details about matching in non-valid UTF-8
+strings, see the
+<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
+documentation.
 </P>
 <P>
 <b>-V</b>, <b>--version</b>
@ -756,7 +823,9 @@ ignored.
 <P>
 <b>-v</b>, <b>--invert-match</b>
 Invert the sense of the match, so that lines which do <i>not</i> match any of
-the patterns are the ones that are found.
+the patterns are the ones that are found. When this option is set, options such
+as <b>--only-matching</b> and <b>--output</b>, which specify parts of a match
+that are to be output, are ignored.
 </P>
 <P>
 <b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
@ -776,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the <b>--include</b> or <b>--exclude</b> options.
 </P>
+<P>
+<b>-Z</b>, <b>--null</b>
+Terminate files names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
+</P>
 <br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
 <P>
 The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
@ -786,16 +862,27 @@ by the <b>--locale</b> option. If no locale is set, the PCRE2 library's default
 <br><a name="SEC8" href="#TOC1">NEWLINES</a><br>
 <P>
 The <b>-N</b> (<b>--newline</b>) option allows <b>pcre2grep</b> to scan files with
-different newline conventions from the default. Any parts of the input files
-that are written to the standard output are copied identically, with whatever
-newline sequences they have in the input. However, the setting of this option
-affects only the way scanned files are processed. It does not affect the
-interpretation of files specified by the <b>-f</b>, <b>--file-list</b>,
-<b>--exclude-from</b>, or <b>--include-from</b> options, nor does it affect the
-way in which <b>pcre2grep</b> writes informational messages to the standard
-error and output streams. For these it uses the string "\n" to indicate
-newlines, relying on the C I/O library to convert this to an appropriate
-sequence.
+newline conventions that differ from the default. This option affects only the
+way scanned files are processed. It does not affect the interpretation of files
+specified by the <b>-f</b>, <b>--file-list</b>, <b>--exclude-from</b>, or
+<b>--include-from</b> options.
+</P>
+<P>
+Any parts of the scanned input files that are written to the standard output
+are copied with whatever newline sequences they have in the input. However, if
+the final line of a file is output, and it does not end with a newline
+sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
+or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
+single NL is used.
+</P>
+<P>
+The newline setting does not affect the way in which <b>pcre2grep</b> writes
+newlines in informational messages to the standard output and error streams.
+Under Windows, the standard output is set to be binary, so that "\r\n" at the
+ends of output lines that are copied from the input is not converted to
+"\r\r\n" by the C I/O library. This means that any messages written to the
+standard output must end with "\r\n". For all other operating systems, and
+for all messages to the standard error stream, "\n" is used.
 </P>
 <br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
 <P>
@ -806,9 +893,9 @@ as in the GNU <b>grep</b> program. Any long option of the form
 <b>--file-offsets</b>, <b>--heap-limit</b>, <b>--include-dir</b>,
 <b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>, <b>-M</b>,
 <b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--om-separator</b>,
-<b>--output</b>, <b>-u</b>, and <b>--utf-8</b> options are specific to
-<b>pcre2grep</b>, as is the use of the <b>--only-matching</b> option with a
-capturing parentheses number.
+<b>--output</b>, <b>-u</b>, <b>--utf</b>, <b>-U</b>, and <b>--utf-allow-invalid</b>
+options are specific to <b>pcre2grep</b>, as is the use of the
+<b>--only-matching</b> option with a capturing parentheses number.
 </P>
 <P>
 Although most of the common options work the same way, a few are different in
@ -868,12 +955,36 @@ documentation for details). Numbered callouts are ignored by <b>pcre2grep</b>;
 only callouts with string arguments are useful.
 </P>
 <br><b>
+Echoing a specific string
+</b><br>
+<P>
+Starting the callout string with a pipe character invokes an echoing facility
+that avoids calling an external program or script. This facility is always
+available, provided that callouts were not completely disabled when
+<b>pcre2grep</b> was built. The rest of the callout string is processed as a
+zero-terminated string, which means it should not contain any internal binary
+zeros. It is written to the output, having first been passed through the same
+escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
+above). However, $0 cannot be used to insert a matched substring because the
+match is still in progress. Instead, the single character '0' is inserted. Any
+syntax errors in the string (for example, a dollar not followed by another
+character) causes the callout to be ignored. No terminator is added to the
+output string, so if you want a newline, you must include it explicitly using
+the escape $n. For example:
+<pre>
+  pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' &#60;some file&#62;
+</pre>
+Matching continues normally after the string is output. If you want to see only
+the callout output but not any output from an actual match, you should end the
+pattern with (*FAIL).
+</P>
+<br><b>
 Calling external programs or scripts
 </b><br>
 <P>
 This facility can be independently disabled when <b>pcre2grep</b> is built. It
 is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
-where <b>lib$spawn()</b> is used, and for any other Unix-like environment where
+where <b>lib$spawn()</b> is used, and for any Unix-like environment where
 <b>fork()</b> and <b>execv()</b> are available.
 </P>
 <P>
@ -885,14 +996,11 @@ arguments:
  executable_name|arg1|arg2|...
 </pre>
 Any substring (including the executable name) may contain escape sequences
-started by a dollar character: $&#60;digits&#62; or ${&#60;digits&#62;} is replaced by the
-captured substring of the given decimal number, which must be greater than
-zero. If the number is greater than the number of capturing substrings, or if
-the capture is unset, the replacement is empty.
-</P>
-<P>
-Any other character is substituted by itself. In particular, $$ is replaced by
-a single dollar and $| is replaced by a pipe character. Here is an example:
+started by a dollar character. These are the same as for the <b>--output</b>
+(<b>-O</b>) option documented above, except that $0 cannot insert the matched
+string because the match is still in progress. Instead, the character '0'
+is inserted. If you need a literal dollar or pipe character in any
+substring, use $$ or $| respectively. Here is an example:
 <pre>
  echo -e "abcde\n12345" | pcre2grep \
    '(?x)(.)(..(.))
@ -905,28 +1013,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
    Arg1: [1] [234] [4] Arg2: |1| ()
    12345
 </pre>
-The parameters for the system call that is used to run the
-program or script are zero-terminated strings. This means that binary zero
-characters in the callout argument will cause premature termination of their
-substrings, and therefore should not be present. Any syntax errors in the
-string (for example, a dollar not followed by another character) cause the
-callout to be ignored. If running the program fails for any reason (including
-the non-existence of the executable), a local matching failure occurs and the
-matcher backtracks in the normal way.
-</P>
-<br><b>
-Echoing a specific string
-</b><br>
-<P>
-This facility is always available, provided that callouts were not completely
-disabled when <b>pcre2grep</b> was built. If the callout string starts with a
-pipe (vertical bar) character, the rest of the string is written to the output,
-having been passed through the same escape processing as text from the --output
-option. This provides a simple echoing facility that avoids calling an external
-program or script. No terminator is added to the string, so if you want a
-newline, you must include it explicitly. Matching continues normally after the
-string is output. If you want to see only the callout output but not any output
-from an actual match, you should end the relevant pattern with (*FAIL).
+The parameters for the system call that is used to run the program or script
+are zero-terminated strings. This means that binary zero characters in the
+callout argument will cause premature termination of their substrings, and
+therefore should not be present. Any syntax errors in the string (for example,
+a dollar not followed by another character) causes the callout to be ignored.
+If running the program fails for any reason (including the non-existence of the
+executable), a local matching failure occurs and the matcher backtracks in the
+normal way.
 </P>
 <br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
 <P>
@ -958,22 +1052,23 @@ because VMS does not distinguish between exit(0) and exit(1).
 </P>
 <br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
 <P>
-<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3).
+<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3),
+<b>pcre2unicode</b>(3).
 </P>
 <br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC16" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 24 November 2018
+Last updated: 30 July 2022
 <br>
-Copyright &copy; 1997-2018 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2jit.html
+++ b/doc/html/pcre2jit.html
@ -54,6 +54,7 @@ platforms:
 <pre>
  ARM 32-bit (v5, v7, and Thumb2)
  ARM 64-bit
+  IBM s390x 64 bit
  Intel x86 32-bit and 64-bit
  MIPS 32-bit and 64-bit
  Power PC 32-bit and 64-bit
@ -90,7 +91,7 @@ or a negative error code.
 There is a limit to the size of pattern that JIT supports, imposed by the size
 of machine stack that it uses. The exact rules are not documented because they
 may change at any time, in particular, when new optimizations are introduced.
-If a pattern is too big, a call to \fBpcre2_jit_compile()\fB returns
+If a pattern is too big, a call to <b>pcre2_jit_compile()</b> returns
 PCRE2_ERROR_NOMEMORY.
 </P>
 <P>
@ -147,25 +148,29 @@ pattern.
 </P>
 <br><a name="SEC4" href="#TOC1">MATCHING SUBJECTS CONTAINING INVALID UTF</a><br>
 <P>
-When a pattern is compiled with the PCRE2_UTF option, the interpretive matching
-function expects its subject string to be a valid sequence of UTF code units.
-If it is not, the result is undefined. This is also true by default of matching
-via JIT. However, if the option PCRE2_JIT_INVALID_UTF is passed to
-<b>pcre2_jit_compile()</b>, code that can process a subject containing invalid
-UTF is compiled.
+When a pattern is compiled with the PCRE2_UTF option, subject strings are
+normally expected to be a valid sequence of UTF code units. By default, this is
+checked at the start of matching and an error is generated if invalid UTF is
+detected. The PCRE2_NO_UTF_CHECK option can be passed to <b>pcre2_match()</b> to
+skip the check (for improved performance) if you are sure that a subject string
+is valid. If this option is used with an invalid string, the result is
+undefined.
 </P>
 <P>
-In this mode, an invalid code unit sequence never matches any pattern item. It
-does not match dot, it does not match \p{Any}, it does not even match negative
-items such as [^X]. A lookbehind assertion fails if it encounters an invalid
-sequence while moving the current point backwards. In other words, an invalid
-UTF code unit sequence acts as a barrier which no match can cross. Reaching an
-invalid sequence causes an immediate backtrack.
+However, a way of running matches on strings that may contain invalid UTF
+sequences is available. Calling <b>pcre2_compile()</b> with the
+PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in
+<b>pcre2_match()</b> to support invalid UTF, and, if <b>pcre2_jit_compile()</b>
+is called, the compiled JIT code also supports invalid UTF. Details of how this
+support works, in both the JIT and the interpretive cases, is given in the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+documentation.
 </P>
 <P>
-Using this option, an application can run matches in arbitrary data, knowing
-that any matched strings that are returned will be valid UTF. This can be
-useful when searching for text in executable or other binary files.
+There is also an obsolete option for <b>pcre2_jit_compile()</b> called
+PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility.
+It is superseded by the <b>pcre2_compile()</b> option PCRE2_MATCH_INVALID_UTF
+and should no longer be used. It may be removed in future.
 </P>
 <br><a name="SEC5" href="#TOC1">UNSUPPORTED OPTIONS AND PATTERN ITEMS</a><br>
 <P>
@ -264,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
 for currently suspended match(es).
 </P>
 <P>
-In a multithread application, if you do not
-specify a JIT stack, or if you assign or pass back NULL from a callback, that
-is thread-safe, because each thread has its own machine stack. However, if you
-assign or pass back a non-NULL JIT stack, this must be a different stack for
-each thread so that the application is thread-safe.
+In a multithread application, if you do not specify a JIT stack, or if you
+assign or pass back NULL from a callback, that is thread-safe, because each
+thread has its own machine stack. However, if you assign or pass back a
+non-NULL JIT stack, this must be a different stack for each thread so that the
+application is thread-safe.
 </P>
 <P>
 Strictly speaking, even more is allowed. You can assign the same non-NULL stack
@ -282,7 +287,7 @@ inefficient solution, and not recommended.
 This is a suggestion for how a multithreaded program that needs to set up
 non-default JIT stacks might operate:
 <pre>
-  During thread initalization
+  During thread initialization
    thread_local_var = pcre2_jit_stack_create(...)

  During thread exit
@ -335,12 +340,12 @@ stack through the JIT callback function.
 You can free a JIT stack at any time, as long as it will not be used by
 <b>pcre2_match()</b> again. When you assign the stack to a match context, only a
 pointer is set. There is no reference counting or any other magic. You can free
-compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo
-not\fP call <b>pcre2_match()</b> with a match context pointing to an already
-freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently
-used by <b>pcre2_match()</b> in another thread). You can also replace the stack
-in a context at any time when it is not in use. You should free the previous
-stack before assigning a replacement.
+compiled patterns, contexts, and stacks in any order, anytime.
+Just <i>do not</i> call <b>pcre2_match()</b> with a match context pointing to an
+already freed stack, as that will cause SEGFAULT. (Also, do not free a stack
+currently used by <b>pcre2_match()</b> in another thread). You can also replace
+the stack in a context at any time when it is not in use. You should free the
+previous stack before assigning a replacement.
 </P>
 <P>
 (5) Should I allocate/free a stack every time before/after calling
@ -377,8 +382,8 @@ out this complicated API.
 <b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
 </P>
 <P>
-The JIT executable allocator does not free all memory when it is possible.
-It expects new allocations, and keeps some free memory around to improve
+The JIT executable allocator does not free all memory when it is possible. It
+expects new allocations, and keeps some free memory around to improve
 allocation speed. However, in low memory conditions, it might be better to free
 all possible memory. You can cause this to happen by calling
 pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
@ -437,10 +442,10 @@ that was not compiled.
 <P>
 When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
 number of other sanity checks are performed on the arguments. For example, if
-the subject pointer is NULL, an immediate error is given. Also, unless
-PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
-interests of speed, these checks do not happen on the JIT fast path, and if
-invalid data is passed, the result is undefined.
+the subject pointer is NULL but the length is non-zero, an immediate error is
+given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
+for validity. In the interests of speed, these checks do not happen on the JIT
+fast path, and if invalid data is passed, the result is undefined.
 </P>
 <P>
 Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
@ -461,9 +466,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC14" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 06 March 2019
+Last updated: 30 November 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2limits.html
+++ b/doc/html/pcre2limits.html
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
 </P>
+<P>
+The maximum amount of heap memory used for matching is controlled by the heap 
+limit, which can be set in a pattern or in a match context. The default is a 
+very large number, effectively unlimited.
+</P>
 <br><b>
 AUTHOR
 </b><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -86,9 +91,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 02 February 2019
+Last updated: 26 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2matching.html
+++ b/doc/html/pcre2matching.html
@ -78,8 +78,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
 If a leaf node is reached, a matching string has been found, and at that point
 the algorithm stops. Thus, if there is more than one possible match, this
 algorithm returns the first one that it finds. Whether this is the shortest,
-the longest, or some intermediate length depends on the way the greedy and
-ungreedy repetition quantifiers are specified in the pattern.
+the longest, or some intermediate length depends on the way the alternations
+and the greedy or ungreedy repetition quantifiers are specified in the
+pattern.
 </P>
 <P>
 Because it ends up with a single path through the tree, it is relatively
@ -109,11 +110,17 @@ no more unterminated paths. At this point, terminated paths represent the
 different matching possibilities (if there are none, the match has failed).
 Thus, if there is more than one possible match, this algorithm finds all of
 them, and in particular, it finds the longest. The matches are returned in
-decreasing order of length. There is an option to stop the algorithm after the
-first match (which is necessarily the shortest) is found.
+the output vector in decreasing order of length. There is an option to stop the
+algorithm after the first match (which is necessarily the shortest) is found.
 </P>
 <P>
-Note that all the matches that are found start at the same point in the
+Note that the size of vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
+data block is therefore not advisable when doing DFA matching.
+</P>
+<P>
+Note also that all the matches that are found start at the same point in the
 subject. If the pattern
 <pre>
  cat(er(pillar)?)?
@ -188,23 +195,20 @@ code unit) at a time, for all active paths through the tree.
 9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
 supported. (*FAIL) is supported, and behaves like a failing negative assertion.
 </P>
+<P>
+10. The PCRE2_MATCH_INVALID_UTF option for <b>pcre2_compile()</b> is not
+supported by <b>pcre2_dfa_match()</b>.
+</P>
 <br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
 <P>
-Using the alternative matching algorithm provides the following advantages:
+The main advantage of the alternative algorithm is that all possible matches
+(at a single point in the subject) are automatically found, and in particular,
+the longest match is found. To find more than one match at the same point using
+the standard algorithm, you have to do kludgy things with callouts.
 </P>
 <P>
-1. All possible matches (at a single point in the subject) are automatically
-found, and in particular, the longest match is found. To find more than one
-match using the standard algorithm, you have to do kludgy things with
-callouts.
-</P>
-<P>
-2. Because the alternative algorithm scans the subject string just once, and
-never needs to backtrack (except for lookbehinds), it is possible to pass very
-long subject strings to the matching function in several pieces, checking for
-partial matching each time. Although it is also possible to do multi-segment
-matching using the standard algorithm, by retaining partially matched
-substrings, it is more complicated. The
+Partial matching is possible with this algorithm, though it has some
+limitations. The
 <a href="pcre2partial.html"><b>pcre2partial</b></a>
 documentation gives details of partial matching and discusses multi-segment
 matching.
@ -219,26 +223,30 @@ because it has to search for all possible matches, but is also because it is
 less susceptible to optimization.
 </P>
 <P>
-2. Capturing parentheses, backreferences, and script runs are not supported.
+2. Capturing parentheses, backreferences, script runs, and matching within
+invalid UTF string are not supported.
 </P>
 <P>
 3. Although atomic groups are supported, their use does not provide the
 performance advantage that it does for the standard algorithm.
 </P>
+<P>
+4. JIT optimization is not supported.
+</P>
 <br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC8" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 10 October 2018
+Last updated: 28 August 2021
 <br>
-Copyright &copy; 1997-2018 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2partial.html
+++ b/doc/html/pcre2partial.html
@ -14,78 +14,123 @@ please consult the man page, in case the conversion went wrong.
 <br>
 <ul>
 <li><a name="TOC1" href="#SEC1">PARTIAL MATCHING IN PCRE2</a>
-<li><a name="TOC2" href="#SEC2">PARTIAL MATCHING USING pcre2_match()</a>
-<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_dfa_match()</a>
-<li><a name="TOC4" href="#SEC4">PARTIAL MATCHING AND WORD BOUNDARIES</a>
-<li><a name="TOC5" href="#SEC5">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a>
+<li><a name="TOC2" href="#SEC2">REQUIREMENTS FOR A PARTIAL MATCH</a>
+<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_match()</a>
+<li><a name="TOC4" href="#SEC4">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
+<li><a name="TOC5" href="#SEC5">PARTIAL MATCHING USING pcre2_dfa_match()</a>
 <li><a name="TOC6" href="#SEC6">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a>
-<li><a name="TOC7" href="#SEC7">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
-<li><a name="TOC8" href="#SEC8">ISSUES WITH MULTI-SEGMENT MATCHING</a>
-<li><a name="TOC9" href="#SEC9">AUTHOR</a>
-<li><a name="TOC10" href="#SEC10">REVISION</a>
+<li><a name="TOC7" href="#SEC7">AUTHOR</a>
+<li><a name="TOC8" href="#SEC8">REVISION</a>
 </ul>
 <br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE2</a><br>
 <P>
-In normal use of PCRE2, if the subject string that is passed to a matching
-function matches as far as it goes, but is too short to match the entire
-pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
-might be helpful to distinguish this case from other cases in which there is no
-match.
+In normal use of PCRE2, if there is a match up to the end of a subject string,
+but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH
+is returned, just like any other failing match. There are circumstances where
+it might be helpful to distinguish this "partial match" case.
 </P>
 <P>
-Consider, for example, an application where a human is required to type in data
-for a field with specific formatting requirements. An example might be a date
-in the form <i>ddmmmyy</i>, defined by this pattern:
+One example is an application where the subject string is very long, and not
+all available at once. The requirement here is to be able to do the matching
+segment by segment, but special action is needed when a matched substring spans
+the boundary between two segments.
+</P>
+<P>
+Another example is checking a user input string as it is typed, to ensure that
+it conforms to a required format. Invalid characters can be immediately
+diagnosed and rejected, giving instant feedback.
+</P>
+<P>
+Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is
+requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT
+options when calling a matching function. The difference between the two
+options is whether or not a partial match is preferred to an alternative
+complete match, though the details differ between the two types of matching
+function. If both options are set, PCRE2_PARTIAL_HARD takes precedence.
+</P>
+<P>
+If you want to use partial matching with just-in-time optimized code, as well
+as setting a partial match option for the matching function, you must also call
+<b>pcre2_jit_compile()</b> with one or both of these options:
 <pre>
-  ^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
-</pre>
-If the application sees the user's keystrokes one by one, and can check that
-what has been typed so far is potentially valid, it is able to raise an error
-as soon as a mistake is made, by beeping and not reflecting the character that
-has been typed, for example. This immediate feedback is likely to be a better
-user interface than a check that is delayed until the entire string has been
-entered. Partial matching can also be useful when the subject string is very
-long and is not all available at once.
-</P>
-<P>
-PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
-PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
-The difference between the two options is whether or not a partial match is
-preferred to an alternative complete match, though the details differ between
-the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
-takes precedence.
-</P>
-<P>
-If you want to use partial matching with just-in-time optimized code, you must
-call <b>pcre2_jit_compile()</b> with one or both of these options:
-<pre>
-  PCRE2_JIT_PARTIAL_SOFT
  PCRE2_JIT_PARTIAL_HARD
+  PCRE2_JIT_PARTIAL_SOFT
 </pre>
 PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
-matches on the same pattern. If the appropriate JIT mode has not been compiled,
-interpretive matching code is used.
+matches on the same pattern. Separate code is compiled for each mode. If the
+appropriate JIT mode has not been compiled, interpretive matching code is used.
 </P>
 <P>
 Setting a partial matching option disables two of PCRE2's standard
-optimizations. PCRE2 remembers the last literal code unit in a pattern, and
-abandons matching immediately if it is not present in the subject string. This
-optimization cannot be used for a subject string that might match only
-partially. PCRE2 also knows the minimum length of a matching string, and does
+optimization hints. PCRE2 remembers the last literal code unit in a pattern,
+and abandons matching immediately if it is not present in the subject string.
+This optimization cannot be used for a subject string that might match only
+partially. PCRE2 also remembers a minimum length of a matching string, and does
 not bother to run the matching function on shorter strings. This optimization
 is also disabled for partial matching.
 </P>
-<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
+<br><a name="SEC2" href="#TOC1">REQUIREMENTS FOR A PARTIAL MATCH</a><br>
 <P>
-A partial match occurs during a call to <b>pcre2_match()</b> when the end of the
-subject string is reached successfully, but matching cannot continue because
-more characters are needed. However, at least one character in the subject must
-have been inspected. This character need not form part of the final matched
-string; lookbehind assertions and the \K escape sequence provide ways of
-inspecting characters before the start of a matched string. The requirement for
-inspecting at least one character exists because an empty string can always be
-matched; without such a restriction there would always be a partial match of an
-empty string at the end of the subject.
+A possible partial match occurs during matching when the end of the subject
+string is reached successfully, but either more characters are needed to
+complete the match, or the addition of more characters might change what is
+matched.
+</P>
+<P>
+Example 1: if the pattern is /abc/ and the subject is "ab", more characters are
+definitely needed to complete a match. In this case both hard and soft matching
+options yield a partial match.
+</P>
+<P>
+Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match
+can be found, but the addition of more characters might change what is
+matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match;
+PCRE2_PARTIAL_SOFT returns the complete match.
+</P>
+<P>
+On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next
+pattern item is \z, \Z, \b, \B, or $ there is always a partial match.
+Otherwise, for both options, the next pattern item must be one that inspects a
+character, and at least one of the following must be true:
+</P>
+<P>
+(1) At least one character has already been inspected. An inspected character
+need not form part of the final matched string; lookbehind assertions and the
+\K escape sequence provide ways of inspecting characters before the start of a
+matched string.
+</P>
+<P>
+(2) The pattern contains one or more lookbehind assertions. This condition
+exists in case there is a lookbehind that inspects characters before the start
+of the match.
+</P>
+<P>
+(3) There is a special case when the whole pattern can match an empty string.
+When the starting point is at the end of the subject, the empty string match is
+a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above
+conditions is true, it is returned. However, because adding more characters
+might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match,
+which in this case means "there is going to be a match at this point, but until
+some more characters are added, we do not know if it will be an empty string or
+something longer".
+</P>
+<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
+<P>
+When a partial matching option is set, the result of calling
+<b>pcre2_match()</b> can be one of the following:
+</P>
+<P>
+<b>A successful match</b>
+A complete match has been found, starting and ending within this subject.
+</P>
+<P>
+<b>PCRE2_ERROR_NOMATCH</b>
+No match can start anywhere in this subject.
+</P>
+<P>
+<b>PCRE2_ERROR_PARTIAL</b>
+Adding more characters may result in a complete match that uses one or more
+characters from the end of this subject.
 </P>
 <P>
 When a partial match is returned, the first two elements in the ovector point
@ -103,54 +148,42 @@ these characters are needed for a subsequent re-match with additional
 characters.
 </P>
 <P>
-What happens when a partial match is identified depends on which of the two
-partial matching options are set.
-</P>
-<br><b>
-PCRE2_PARTIAL_SOFT WITH pcre2_match()
-</b><br>
-<P>
-If PCRE2_PARTIAL_SOFT is set when <b>pcre2_match()</b> identifies a partial
-match, the partial match is remembered, but matching continues as normal, and
-other alternatives in the pattern are tried. If no complete match can be found,
-PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
-</P>
-<P>
-This option is "soft" because it prefers a complete match over a partial match.
-All the various matching items in a pattern behave as if the subject string is
-potentially complete. For example, \z, \Z, and $ match at the end of the
-subject, as normal, and for \b and \B the end of the subject is treated as a
-non-alphanumeric.
-</P>
-<P>
 If there is more than one partial match, the first one that was found provides
 the data that is returned. Consider this pattern:
 <pre>
  /123\w+X|dogY/
 </pre>
-If this is matched against the subject string "abc123dog", both
-alternatives fail to match, but the end of the subject is reached during
-matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
-identifying "123dog" as the first partial match that was found. (In this
-example, there are two partial matches, because "dog" on its own partially
-matches the second alternative.)
+If this is matched against the subject string "abc123dog", both alternatives
+fail to match, but the end of the subject is reached during matching, so
+PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying
+"123dog" as the first partial match. (In this example, there are two partial
+matches, because "dog" on its own partially matches the second alternative.)
 </P>
 <br><b>
-PCRE2_PARTIAL_HARD WITH pcre2_match()
+How a partial match is processed by pcre2_match()
 </b><br>
 <P>
-If PCRE2_PARTIAL_HARD is set for <b>pcre2_match()</b>, PCRE2_ERROR_PARTIAL is
-returned as soon as a partial match is found, without continuing to search for
-possible complete matches. This option is "hard" because it prefers an earlier
-partial match over a later complete match. For this reason, the assumption is
-made that the end of the supplied subject string may not be the true end of the
-available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end
-of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
-character in the subject has been inspected.
+What happens when a partial match is identified depends on which of the two
+partial matching options is set.
+</P>
+<P>
+If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a
+partial match is found, without continuing to search for possible complete
+matches. This option is "hard" because it prefers an earlier partial match over
+a later complete match. For this reason, the assumption is made that the end of
+the supplied subject string is not the true end of the available data, which is
+why \z, \Z, \b, \B, and $ always give a partial match.
+</P>
+<P>
+If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching
+continues as normal, and other alternatives in the pattern are tried. If no
+complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of
+PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match
+over a partial match. All the various matching items in a pattern behave as if
+the subject string is potentially complete; \z, \Z, and $ match at the end of
+the subject, as normal, and for \b and \B the end of the subject is treated
+as a non-alphanumeric.
 </P>
-<br><b>
-Comparing hard and soft partial matching
-</b><br>
 <P>
 The difference between the two partial matching options can be illustrated by a
 pattern such as:
@ -175,26 +208,135 @@ to follow this explanation by thinking of the two patterns like this:
 The second pattern will never match "dogsbody", because it will always find the
 shorter match first.
 </P>
-<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
+<br><b>
+Example of partial matching using pcre2test
+</b><br>
 <P>
-The DFA functions move along the subject string character by character, without
+The <b>pcre2test</b> data modifiers <b>partial_hard</b> (or <b>ph</b>) and
+<b>partial_soft</b> (or <b>ps</b>) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT,
+respectively, when calling <b>pcre2_match()</b>. Here is a run of
+<b>pcre2test</b> using a pattern that matches the whole subject in the form of a
+date:
+<pre>
+    re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+  data&#62; 25dec3\=ph
+  Partial match: 23dec3
+  data&#62; 3ju\=ph
+  Partial match: 3ju
+  data&#62; 3juj\=ph
+  No match
+</pre>
+This example gives the same results for both hard and soft partial matching
+options. Here is an example where there is a difference:
+<pre>
+    re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+  data&#62; 25jun04\=ps
+   0: 25jun04
+   1: jun
+  data&#62; 25jun04\=ph
+  Partial match: 25jun04
+</pre>
+With PCRE2_PARTIAL_SOFT, the subject is matched completely. For
+PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so
+there is only a partial match.
+</P>
+<br><a name="SEC4" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
+<P>
+PCRE was not originally designed with multi-segment matching in mind. However,
+over time, features (including partial matching) that make multi-segment
+matching possible have been added. A very long string can be searched segment
+by segment by calling <b>pcre2_match()</b> repeatedly, with the aim of achieving
+the same results that would happen if the entire string was available for
+searching all the time. Normally, the strings that are being sought are much
+shorter than each individual segment, and are in the middle of very long
+strings, so the pattern is normally not anchored.
+</P>
+<P>
+Special logic must be implemented to handle a matched substring that spans a
+segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a
+partial match at the end of a segment whenever there is the possibility of
+changing the match by adding more characters. The PCRE2_NOTBOL option should
+also be set for all but the first segment.
+</P>
+<P>
+When a partial match occurs, the next segment must be added to the current
+subject and the match re-run, using the <i>startoffset</i> argument of
+<b>pcre2_match()</b> to begin at the point where the partial match started.
+For example:
+<pre>
+    re&#62; /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
+  data&#62; ...the date is 23ja\=ph
+  Partial match: 23ja
+  data&#62; ...the date is 23jan19 and on that day...\=offset=15
+   0: 23jan19
+   1: jan
+</pre>
+Note the use of the <b>offset</b> modifier to start the new match where the
+partial match was found. In this example, the next segment was added to the one
+in which the partial match was found. This is the most straightforward
+approach, typically using a memory buffer that is twice the size of each
+segment. After a partial match, the first half of the buffer is discarded, the
+second half is moved to the start of the buffer, and a new segment is added
+before repeating the match as in the example above. After a no match, the
+entire buffer can be discarded.
+</P>
+<P>
+If there are memory constraints, you may want to discard text that precedes a
+partial match before adding the next segment. Unfortunately, this is not at
+present straightforward. In cases such as the above, where the pattern does not
+contain any lookbehinds, it is sufficient to retain only the partially matched
+substring. However, if the pattern contains a lookbehind assertion, characters
+that precede the start of the partial match may have been inspected during the
+matching process. When <b>pcre2test</b> displays a partial match, it indicates
+these characters with '&#60;' if the <b>allusedtext</b> modifier is set:
+<pre>
+    re&#62; "(?&#60;=123)abc"
+  data&#62; xx123ab\=ph,allusedtext
+  Partial match: 123ab
+                 &#60;&#60;&#60;
+</pre>
+However, the <b>allusedtext</b> modifier is not available for JIT matching,
+because JIT matching does not record the first (or last) consulted characters.
+For this reason, this information is not available via the API. It is therefore
+not possible in general to obtain the exact number of characters that must be
+retained in order to get the right match result. If you cannot retain the
+entire segment, you must find some heuristic way of choosing.
+</P>
+<P>
+If you know the approximate length of the matching substrings, you can use that
+to decide how much text to retain. The only lookbehind information that is
+currently available via the API is the length of the longest individual
+lookbehind in a pattern, but this can be misleading if there are nested
+lookbehinds. The value returned by calling <b>pcre2_pattern_info()</b> with the
+PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code
+units) that any individual lookbehind moves back when it is processed. A
+pattern such as "(?&#60;=(?&#60;!b)a)" has a maximum lookbehind value of one, but
+inspects two characters before its starting point.
+</P>
+<P>
+In a non-UTF or a 32-bit case, moving back is just a subtraction, but in
+UTF-8 or UTF-16 you have to count characters while moving back through the code
+units.
+</P>
+<br><a name="SEC5" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
+<P>
+The DFA function moves along the subject string character by character, without
 backtracking, searching for all possible matches simultaneously. If the end of
 the subject is reached before the end of the pattern, there is the possibility
-of a partial match, again provided that at least one character has been
-inspected.
+of a partial match.
 </P>
 <P>
 When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
 have been no complete matches. Otherwise, the complete matches are returned.
-However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
-any complete matches. The portion of the string that was matched when the
-longest partial match was found is set as the first matching string.
+If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any
+complete matches. The portion of the string that was matched when the longest
+partial match was found is set as the first matching string.
 </P>
 <P>
-Because the DFA functions always search for all possible matches, and there is
-no difference between greedy and ungreedy repetition, their behaviour is
-different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
-the string "dog" matched against the ungreedy pattern shown above:
+Because the DFA function always searches for all possible matches, and there is
+no difference between greedy and ungreedy repetition, its behaviour is
+different from the <b>pcre2_match()</b>. Consider the string "dog" matched
+against this ungreedy pattern:
 <pre>
  /dog(sbody)??/
 </pre>
@ -202,58 +344,16 @@ Whereas the standard function stops as soon as it finds the complete match for
 "dog", the DFA function also finds the partial match for "dogsbody", and so
 returns that when PCRE2_PARTIAL_HARD is set.
 </P>
-<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
-<P>
-If a pattern ends with one of sequences \b or \B, which test for word
-boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
-results. Consider this pattern:
-<pre>
-  /\bcat\b/
-</pre>
-This matches "cat", provided there is a word boundary at either end. If the
-subject string is "the cat", the comparison of the final "t" with a following
-character cannot take place, so a partial match is found. However, normal
-matching carries on, and \b matches at the end of the subject when the last
-character is a letter, so a complete match is found. The result, therefore, is
-<i>not</i> PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
-PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
-</P>
-<br><a name="SEC5" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a><br>
-<P>
-If the <b>partial_soft</b> (or <b>ps</b>) modifier is present on a
-<b>pcre2test</b> data line, the PCRE2_PARTIAL_SOFT option is used for the match.
-Here is a run of <b>pcre2test</b> that uses the date example quoted above:
-<pre>
-    re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
-  data&#62; 25jun04\=ps
-   0: 25jun04
-   1: jun
-  data&#62; 25dec3\=ps
-  Partial match: 23dec3
-  data&#62; 3ju\=ps
-  Partial match: 3ju
-  data&#62; 3juj\=ps
-  No match
-  data&#62; j\=ps
-  No match
-</pre>
-The first data string is matched completely, so <b>pcre2test</b> shows the
-matched substrings. The remaining four strings do not match the complete
-pattern, but the first two are partial matches. Similar output is obtained
-if DFA matching is used.
-</P>
-<P>
-If the <b>partial_hard</b> (or <b>ph</b>) modifier is present on a
-<b>pcre2test</b> data line, the PCRE2_PARTIAL_HARD option is set for the match.
-</P>
 <br><a name="SEC6" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a><br>
 <P>
-When a partial match has been found using a DFA matching function, it is
+When a partial match has been found using the DFA matching function, it is
 possible to continue the match by providing additional subject data and calling
 the function again with the same compiled regular expression, this time setting
 the PCRE2_DFA_RESTART option. You must pass the same working space as before,
-because this is where details of the previous partial match are stored. Here is
-an example using <b>pcre2test</b>:
+because this is where details of the previous partial match are stored. You can
+set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART
+to continue partial matching over multiple segments. Here is an example using
+<b>pcre2test</b>:
 <pre>
    re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
  data&#62; 23ja\=dfa,ps
@ -265,155 +365,10 @@ The first call has "23ja" as the subject, and requests partial matching; the
 second call has "n05" as the subject for the continued (restarted) match.
 Notice that when the match is complete, only the last part is shown; PCRE2 does
 not retain the previously partially-matched string. It is up to the calling
-program to do that if it needs to.
-</P>
-<P>
-That means that, for an unanchored pattern, if a continued match fails, it is
-not possible to try again at a new starting point. All this facility is capable
-of doing is continuing with the previous match attempt. In the previous
-example, if the second set of data is "ug23" the result is no match, even
-though there would be a match for "aug23" if the entire string were given at
-once. Depending on the application, this may or may not be what you want.
-The only way to allow for starting again at the next character is to retain the
-matched part of the subject and try a new complete match.
-</P>
-<P>
-You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
-PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
-facility can be used to pass very long subject strings to the DFA matching
-functions.
-</P>
-<br><a name="SEC7" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
-<P>
-Unlike the DFA function, it is not possible to restart the previous match with
-a new segment of data when using <b>pcre2_match()</b>. Instead, new data must be
-added to the previous subject string, and the entire match re-run, starting
-from the point where the partial match occurred. Earlier data can be discarded.
-</P>
-<P>
-It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
-treat the end of a segment as the end of the subject when matching \z, \Z,
-\b, \B, and $. Consider an unanchored pattern that matches dates:
-<pre>
-    re&#62; /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
-  data&#62; The date is 23ja\=ph
-  Partial match: 23ja
-</pre>
-At this stage, an application could discard the text preceding "23ja", add on
-text from the next segment, and call the matching function again. Unlike the
-DFA matching function, the entire matching string must always be available,
-and the complete matching process occurs for each call, so more memory and more
-processing time is needed.
-</P>
-<br><a name="SEC8" href="#TOC1">ISSUES WITH MULTI-SEGMENT MATCHING</a><br>
-<P>
-Certain types of pattern may give problems with multi-segment matching,
-whichever matching function is used.
-</P>
-<P>
-1. If the pattern contains a test for the beginning of a line, you need to pass
-the PCRE2_NOTBOL option when the subject string for any call does start at the
-beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
-doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
-includes the effect of PCRE2_NOTEOL.
-</P>
-<P>
-2. If a pattern contains a lookbehind assertion, characters that precede the
-start of the partial match may have been inspected during the matching process.
-When using <b>pcre2_match()</b>, sufficient characters must be retained for the
-next match attempt. You can ensure that enough characters are retained by doing
-the following:
-</P>
-<P>
-Before doing any matching, find the length of the longest lookbehind in the
-pattern by calling <b>pcre2_pattern_info()</b> with the PCRE2_INFO_MAXLOOKBEHIND
-option. Note that the resulting count is in characters, not code units. After a
-partial match, moving back from the ovector[0] offset in the subject by the
-number of characters given for the maximum lookbehind gets you to the earliest
-character that must be retained. In a non-UTF or a 32-bit situation, moving
-back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
-while moving back through the code units.
-</P>
-<P>
-Characters before the point you have now reached can be discarded, and after
-the next segment has been added to what is retained, you should run the next
-match with the <b>startoffset</b> argument set so that the match begins at the
-same point as before.
-</P>
-<P>
-For example, if the pattern "(?&#60;=123)abc" is partially matched against the
-string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
-lookbehind count is 3, so all characters before offset 2 can be discarded. The
-value of <b>startoffset</b> for the next match should be 3. When <b>pcre2test</b>
-displays a partial match, it indicates the lookbehind characters with '&#60;'
-characters:
-<pre>
-    re&#62; "(?&#60;=123)abc"
-  data&#62; xx123ab\=ph
-  Partial match: 123ab
-                 &#60;&#60;&#60;
-</PRE>
-</P>
-<P>
-3. Because a partial match must always contain at least one character, what
-might be considered a partial match of an empty string actually gives a "no
-match" result. For example:
-<pre>
-    re&#62; /c(?&#60;=abc)x/
-  data&#62; ab\=ps
-  No match
-</pre>
-If the next segment begins "cx", a match should be found, but this will only
-happen if characters from the previous segment are retained. For this reason, a
-"no match" result should be interpreted as "partial match of an empty string"
-when the pattern contains lookbehinds.
-</P>
-<P>
-4. Matching a subject string that is split into multiple segments may not
-always produce exactly the same result as matching over one single long string,
-especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
-Word Boundaries" above describes an issue that arises if the pattern ends with
-\b or \B. Another kind of difference may occur when there are multiple
-matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
-is given only when there are no completed matches. This means that as soon as
-the shortest match has been found, continuation to a new subject segment is no
-longer possible. Consider this <b>pcre2test</b> example:
-<pre>
-    re&#62; /dog(sbody)?/
-  data&#62; dogsb\=ps
-   0: dog
-  data&#62; do\=ps,dfa
-  Partial match: do
-  data&#62; gsb\=ps,dfa,dfa_restart
-   0: g
-  data&#62; dogsbody\=dfa
-   0: dogsbody
-   1: dog
-</pre>
-The first data line passes the string "dogsb" to a standard matching function,
-setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
-for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
-string "dog" is a complete match. Similarly, when the subject is presented to
-a DFA matching function in several parts ("do" and "gsb" being the first two)
-the match stops when "dog" has been found, and it is not possible to continue.
-On the other hand, if "dogsbody" is presented as a single string, a DFA
-matching function finds both matches.
-</P>
-<P>
-Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
-multi-segment data. The example above then behaves differently:
-<pre>
-    re&#62; /dog(sbody)?/
-  data&#62; dogsb\=ph
-  Partial match: dogsb
-  data&#62; do\=ps,dfa
-  Partial match: do
-  data&#62; gsb\=ph,dfa,dfa_restart
-  Partial match: gsb
-</pre>
-5. Patterns that contain alternatives at the top level which do not all start
-with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
-used. For example, consider this pattern:
+program to do that if it needs to. This means that, for an unanchored pattern,
+if a continued match fails, it is not possible to try again at a new starting
+point. All this facility is capable of doing is continuing with the previous
+match attempt. For example, consider this pattern:
 <pre>
  1234|3789
 </pre>
@ -422,30 +377,18 @@ alternative is found at offset 3. There is no partial match for the second
 alternative, because such a match does not start at the same point in the
 subject string. Attempting to continue with the string "7890" does not yield a
 match because only those alternatives that match at one point in the subject
-are remembered. The problem arises because the start of the second alternative
-matches within the first alternative. There is no problem with anchored
-patterns or patterns such as:
-<pre>
-  1234|ABCD
-</pre>
-where no string can be a partial match for both alternatives. This is not a
-problem if a standard matching function is used, because the entire match has
-to be rerun each time:
-<pre>
-    re&#62; /1234|3789/
-  data&#62; ABC123\=ph
-  Partial match: 123
-  data&#62; 1237890
-   0: 3789
-</pre>
-Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
-the entire match can also be used with the DFA matching function. Another
-possibility is to work with two buffers. If a partial match at offset <i>n</i>
-in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
-the second buffer, you can then try a new match starting at offset <i>n+1</i> in
-the first buffer.
+are remembered. Depending on the application, this may or may not be what you
+want.
 </P>
-<br><a name="SEC9" href="#TOC1">AUTHOR</a><br>
+<P>
+If you do want to allow for starting again at the next character, one way of
+doing it is to retain some or all of the segment and try a new complete match,
+as described for <b>pcre2_match()</b> above. Another possibility is to work with
+two buffers. If a partial match at offset <i>n</i> in the first buffer is
+followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you
+can then try a new match starting at offset <i>n+1</i> in the first buffer.
+</P>
+<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
@ -454,11 +397,11 @@ University Computing Service
 Cambridge, England.
 <br>
 </P>
-<br><a name="SEC10" href="#TOC1">REVISION</a><br>
+<br><a name="SEC8" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 22 December 2014
+Last updated: 04 September 2019
 <br>
-Copyright &copy; 1997-2014 University of Cambridge.
+Copyright &copy; 1997-2019 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2pattern.html
+++ b/doc/html/pcre2pattern.html
@ -33,17 +33,18 @@ please consult the man page, in case the conversion went wrong.
 <li><a name="TOC18" href="#SEC18">ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS</a>
 <li><a name="TOC19" href="#SEC19">BACKREFERENCES</a>
 <li><a name="TOC20" href="#SEC20">ASSERTIONS</a>
-<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
-<li><a name="TOC22" href="#SEC22">CONDITIONAL GROUPS</a>
-<li><a name="TOC23" href="#SEC23">COMMENTS</a>
-<li><a name="TOC24" href="#SEC24">RECURSIVE PATTERNS</a>
-<li><a name="TOC25" href="#SEC25">GROUPS AS SUBROUTINES</a>
-<li><a name="TOC26" href="#SEC26">ONIGURUMA SUBROUTINE SYNTAX</a>
-<li><a name="TOC27" href="#SEC27">CALLOUTS</a>
-<li><a name="TOC28" href="#SEC28">BACKTRACKING CONTROL</a>
-<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
-<li><a name="TOC30" href="#SEC30">AUTHOR</a>
-<li><a name="TOC31" href="#SEC31">REVISION</a>
+<li><a name="TOC21" href="#SEC21">NON-ATOMIC ASSERTIONS</a>
+<li><a name="TOC22" href="#SEC22">SCRIPT RUNS</a>
+<li><a name="TOC23" href="#SEC23">CONDITIONAL GROUPS</a>
+<li><a name="TOC24" href="#SEC24">COMMENTS</a>
+<li><a name="TOC25" href="#SEC25">RECURSIVE PATTERNS</a>
+<li><a name="TOC26" href="#SEC26">GROUPS AS SUBROUTINES</a>
+<li><a name="TOC27" href="#SEC27">ONIGURUMA SUBROUTINE SYNTAX</a>
+<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
+<li><a name="TOC29" href="#SEC29">BACKTRACKING CONTROL</a>
+<li><a name="TOC30" href="#SEC30">SEE ALSO</a>
+<li><a name="TOC31" href="#SEC31">AUTHOR</a>
+<li><a name="TOC32" href="#SEC32">REVISION</a>
 </ul>
 <br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION DETAILS</a><br>
 <P>
@ -91,10 +92,11 @@ single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
 specified for the 32-bit library, in which case it constrains the character
 values to valid Unicode code points. To process UTF strings, PCRE2 must be
 built to include Unicode support (which is the default). When using UTF strings
-you must either call the compiling function with the PCRE2_UTF option, or the
-pattern must start with the special sequence (*UTF), which is equivalent to
-setting the relevant option. How setting a UTF mode affects pattern matching is
-mentioned in several places below. There is also a summary of features in the
+you must either call the compiling function with one or both of the PCRE2_UTF
+or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special
+sequence (*UTF), which is equivalent to setting the relevant PCRE2_UTF. How
+setting a UTF mode affects pattern matching is mentioned in several places
+below. There is also a summary of features in the
 <a href="pcre2unicode.html"><b>pcre2unicode</b></a>
 page.
 </P>
@ -112,7 +114,8 @@ Another special sequence that may appear at the start of a pattern is (*UCP).
 This has the same effect as setting the PCRE2_UCP option: it causes sequences
 such as \d and \w to use Unicode properties to determine character types,
 instead of recognizing only characters with codes less than 256 via a lookup
-table.
+table. If also causes upper/lower casing operations to use Unicode properties
+for characters with code points greater than 127, even when UTF is not set.
 </P>
 <P>
 Some applications that allow their users to supply patterns may wish to
@ -286,8 +289,11 @@ corresponding characters in the subject. As a trivial example, the pattern
  The quick brown fox
 </pre>
 matches a portion of a subject string that is identical to itself. When
-caseless matching is specified (the PCRE2_CASELESS option), letters are matched
-independently of case.
+caseless matching is specified (the PCRE2_CASELESS option or (?i) within the
+pattern), letters are matched independently of case. Note that there are two
+ASCII characters, K and S, that, in addition to their lower case ASCII
+equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
+(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set.
 </P>
 <P>
 The power of regular expressions comes from the ability to include wild cards,
@ -323,6 +329,20 @@ a character class the only metacharacters are:
  [      POSIX character class (if followed by POSIX syntax)
  ]      terminates the character class
 </pre>
+If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
+the pattern, other than in a character class, and characters between a #
+outside a character class and the next newline, inclusive, are ignored. An
+escaping backslash can be used to include a white space or a # character as
+part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same
+applies, but in addition unescaped space and horizontal tab characters are
+ignored inside a character class. Note: only these two characters are ignored,
+not the full set of pattern white space characters that are ignored outside a
+character class. Option settings can be changed within a pattern; see the
+section entitled
+<a href="#internaloptions">"Internal Option Setting"</a>
+below.
+</P>
+<P>
 The following sections describe the use of each of the metacharacters.
 </P>
 <br><a name="SEC5" href="#TOC1">BACKSLASH</a><br>
@ -340,16 +360,9 @@ precede a non-alphanumeric with backslash to specify that it stands for itself.
 In particular, if you want to match a backslash, you write \\.
 </P>
 <P>
-In a UTF mode, only ASCII digits and letters have any special meaning after a
-backslash. All other characters (in particular, those whose code points are
-greater than 127) are treated as literals.
-</P>
-<P>
-If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
-the pattern (other than in a character class), and characters between a #
-outside a character class and the next newline, inclusive, are ignored. An
-escaping backslash can be used to include a white space or # character as part
-of the pattern.
+Only ASCII digits and letters have any special meaning after a backslash. All
+other characters (in particular, those whose code points are greater than 127)
+are treated as literals.
 </P>
 <P>
 If you want to treat all characters in a sequence as literals, you can do so by
@ -428,11 +441,11 @@ There may be any number of hexadecimal digits. This syntax is from ECMAScript
 6.
 </P>
 <P>
-The \N{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
-is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
-\N{name} to specify characters by Unicode name; PCRE2 does not support this.
-Note that when \N is not followed by an opening brace (curly bracket) it has
-an entirely different meaning, matching any character that is not a newline.
+The \N{U+hhh..} escape sequence is recognized only when PCRE2 is operating in
+UTF mode. Perl also uses \N{name} to specify characters by Unicode name; PCRE2
+does not support this. Note that when \N is not followed by an opening brace
+(curly bracket) it has an entirely different meaning, matching any character
+that is not a newline.
 </P>
 <P>
 There are some legacy applications where the escape sequence \r is expected to
@ -521,7 +534,7 @@ for themselves. For example, outside a character class:
  \0113  is a tab followed by the character "3"
  \113   might be a backreference, otherwise the character with octal code 113
  \377   might be a backreference, otherwise the value 255 (decimal)
-  \81    is always a backreference .sp
+  \81    is always a backreference
 </pre>
 Note that octal values of 100 or greater that are specified using this syntax
 must not be introduced by a leading zero, because no more than three octal
@ -732,7 +745,7 @@ Unicode support is not needed for these characters to be recognized.
 <P>
 It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
 complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
-at compile time. (BSR is an abbrevation for "backslash R".) This can be made
+at compile time. (BSR is an abbreviation for "backslash R".) This can be made
 the default when PCRE2 is built; if this is the case, the other behaviour can
 be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
 these settings by starting a pattern string with one of the following
@ -763,186 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
 sequences are of course limited to testing characters whose code points are
 less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
 greater than 0x10ffff (the Unicode limit) may be encountered. These are all
-treated as being in the Unknown script and with an unassigned type. The extra
-escape sequences are:
+treated as being in the Unknown script and with an unassigned type.
+</P>
+<P>
+Matching characters by Unicode property is not fast, because PCRE2 has to do a
+multistage table lookup in order to find a character's property. That is why
+the traditional escape sequences such as \d and \w do not use Unicode
+properties in PCRE2 by default, though you can make them do so by setting the
+PCRE2_UCP option or by starting the pattern with (*UCP).
+</P>
+<P>
+The extra escape sequences that provide property support are:
 <pre>
  \p{<i>xx</i>}   a character with the <i>xx</i> property
  \P{<i>xx</i>}   a character without the <i>xx</i> property
  \X       a Unicode extended grapheme cluster
 </pre>
-The property names represented by <i>xx</i> above are case-sensitive. There is
-support for Unicode script names, Unicode general category properties, "Any",
-which matches any character (including newline), and some special PCRE2
-properties (described in the
-<a href="#extraprops">next section).</a>
-Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
-Note that \P{Any} does not match any characters, so always causes a match
-failure.
+The property names represented by <i>xx</i> above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
+<a href="#extraprops">below).</a>
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \P{Any} does not match any characters, so always causes a
+match failure.
+</P>
+<br><b>
+Script properties for \p and \P
+</b><br>
+<P>
+There are three different syntax forms for matching a script. Each Unicode
+character has a basic script and, optionally, a list of other scripts ("Script
+Extensions") with which it is commonly used. Using the Adlam script as an
+example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
+\p{scx:Adlam} matches, in addition, characters that have Adlam in their
+extensions list. The full names "script" and "script extensions" for the
+property types are recognized, and a equals sign is an alternative to the
+colon. If a script name is given without a property type, for example,
+\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
+interpretation at release 5.26 and PCRE2 changed at release 10.40.
 </P>
 <P>
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-<pre>
-  \p{Greek}
-  \P{Han}
-</pre>
 Unassigned characters (and in non-UTF 32-bit mode, characters with code points
 greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
 part of an identified script are lumped together as "Common". The current list
-of scripts is:
-</P>
-<P>
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Unknown,
-Vai,
-Warang_Citi,
-Yi,
-Zanabazar_Square.
+of recognized script names and their 4-character abbreviations can be obtained
+by running this command:
+<pre>
+  pcre2test -LS
+
+</PRE>
 </P>
+<br><b>
+The general category property for \p and \P
+</b><br>
 <P>
 Each character has exactly one Unicode general category property, specified by
 a two-letter abbreviation. For compatibility with Perl, negation can be
@ -1004,9 +893,9 @@ The following general category property codes are supported:
  Zp    Paragraph separator
  Zs    Space separator
 </pre>
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
+The special property LC, which has the synonym L&, is also supported: it
+matches a character that has the Lu, Ll, or Lt property, in other words, a
+letter that is not classified as a modifier or "other".
 </P>
 <P>
 The Cs (Surrogate) property applies only to characters whose code points are in
@ -1033,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
 example, \p{Lu} always matches only upper case letters. This is different from
 the behaviour of current versions of Perl.
 </P>
+<br><b>
+Binary (yes/no) properties for \p and \P
+</b><br>
 <P>
-Matching characters by Unicode property is not fast, because PCRE2 has to do a
-multistage table lookup in order to find a character's property. That is why
-the traditional escape sequences such as \d and \w do not use Unicode
-properties in PCRE2 by default, though you can make them do so by setting the
-PCRE2_UCP option or by starting the pattern with (*UCP).
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\p and \P, along with their abbreviations, by running this command:
+<pre>
+  pcre2test -LP
+
+</PRE>
+</P>
+<br><b>
+The Bidi_Class property for \p and \P
+</b><br>
+<P>
+<pre>
+  \p{Bidi_Class:&#60;class&#62;}   matches a character with the given class
+  \p{BC:&#60;class&#62;}           matches a character with the given class
+</pre>
+The recognized classes are:
+<pre>
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          which space
+</pre>
+An equals sign may be used instead of a colon. The class names are
+case-insensitive; only the short names listed above are recognized.
 </P>
 <br><b>
 Extended grapheme clusters
@ -1069,7 +1000,7 @@ additional characters according to the following rules for ending a cluster:
 3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
 are of five types: L, V, T, LV, and LVT. An L character may be followed by an
 L, V, LV, or LVT character; an LV or V character may be followed by a V or T
-character; an LVT or T character may be follwed only by a T character.
+character; an LVT or T character may be followed only by a T character.
 </P>
 <P>
 4. Do not end before extending characters or spacing marks or the "zero-width
@ -1154,8 +1085,11 @@ For example, when the pattern
 matches "foobar", the first substring is still set to "foo".
 </P>
 <P>
-Perl documents that the use of \K within assertions is "not well defined". In
-PCRE2, \K is acted upon when it occurs inside positive assertions, but is
+From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From
+release 10.38 PCRE2 also forbids this by default. However, the
+PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
+<b>pcre2_compile()</b> to re-enable the previous behaviour. When this option is
+set, \K is acted upon when it occurs inside positive assertions, but is
 ignored in negative assertions. Note that when a pattern such as (?=ab\K)
 matches, the reported start of the match can be greater than the end of the
 match. Using \K in a lookbehind assertion at the start of a pattern can also
@ -1312,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
 <P>
 Outside a character class, a dot in the pattern matches any one character in
 the subject string except (by default) a character that signifies the end of a
-line.
+line. One or more characters may be specified as line terminators (see
+<a href="#newlines">"Newline conventions"</a>
+above).
 </P>
 <P>
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
+Dot never matches a single line-ending character. When the two-character
+sequence CRLF is the only line ending, dot does not match CR if it is
+immediately followed by LF, but otherwise it matches all characters (including
+isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurences
+of CR of LF match dot. When all Unicode line endings are being recognized, dot
+does not match CR or LF or any of the other line ending characters.
 </P>
 <P>
 The behaviour of dot with regard to newlines can be changed. If the
@ -1360,7 +1296,7 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
 with a malformed UTF character. This has undefined results, because PCRE2
 assumes that it is matching character by character in a valid UTF string (by
 default it checks the subject string's validity at the start of processing
-unless the PCRE2_NO_UTF_CHECK option is used).
+unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used).
 </P>
 <P>
 An application can lock out the use of \C by setting the
@ -1432,7 +1368,10 @@ Characters in a class may be specified by their code points using \o, \x, or
 \N{U+hh..} in the usual way. When caseless matching is set, any letters in a
 class represent both their upper case and lower case versions, so for example,
 a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
-match "A", whereas a caseful version would.
+match "A", whereas a caseful version would. Note that there are two ASCII
+characters, K and S, that, in addition to their lower case ASCII equivalents,
+are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S)
+respectively when either PCRE2_UTF or PCRE2_UCP is set.
 </P>
 <P>
 Characters that might indicate line breaks are never treated in any special way
@ -1644,7 +1583,7 @@ that succeeds is used. If the alternatives are within a group
 <a href="#group">(defined below),</a>
 "succeeds" means matching the rest of the main pattern as well as the
 alternative in the group.
-</P>
+<a name="internaloptions"></a></P>
 <br><a name="SEC13" href="#TOC1">INTERNAL OPTION SETTING</a><br>
 <P>
 The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL,
@ -1895,12 +1834,19 @@ are permitted for groups with the same number, for example:
  (?|(?&#60;AA&#62;aa)|(?&#60;AA&#62;bb))
 </pre>
 The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES
-option at compile time, or by the use of (?J) within the pattern. Duplicate
-names can be useful for patterns where only one instance of the named capture
-group can match. Suppose you want to match the name of a weekday, either as a
-3-letter abbreviation or as the full name, and in both cases you want to
-extract the abbreviation. This pattern (ignoring the line breaks) does the job:
+option at compile time, or by the use of (?J) within the pattern, as described
+in the section entitled
+<a href="#internaloptions">"Internal Option Setting"</a>
+above.
+</P>
+<P>
+Duplicate names can be useful for patterns where only one instance of the named
+capture group can match. Suppose you want to match the name of a weekday,
+either as a 3-letter abbreviation or as the full name, and in both cases you
+want to extract the abbreviation. This pattern (ignoring the line breaks) does
+the job:
 <pre>
+  (?J)
  (?&#60;DN&#62;Mon|Fri|Sun)(?:day)?|
  (?&#60;DN&#62;Tue)(?:sday)?|
  (?&#60;DN&#62;Wed)(?:nesday)?|
@ -1921,7 +1867,7 @@ they appear in the overall pattern. The first one that is set is used for the
 reference. For example, this pattern matches both "foofoo" and "barbar" but not
 "foobar" or "barfoo":
 <pre>
-  (?:(?&#60;n&#62;foo)|(?&#60;n&#62;bar))\k&#60;n&#62;
+  (?J)(?:(?&#60;n&#62;foo)|(?&#60;n&#62;bar))\k&#60;n&#62;

 </PRE>
 </P>
@ -1955,7 +1901,7 @@ items:
  an escape such as \d or \pL that matches a single character
  a character class
  a backreference
-  a parenthesized group (including most assertions)
+  a parenthesized group (including lookaround assertions)
  a subroutine call (recursive or otherwise)
 </pre>
 The general repetition quantifier specifies a minimum and maximum number of
@ -2013,8 +1959,10 @@ no characters with a quantifier that has no upper limit, for example:
 </pre>
 Earlier versions of Perl and PCRE1 used to give an error at compile time for
 such patterns. However, because there are cases where this can be useful, such
-patterns are now accepted, but if any repetition of the group does in fact
-match no characters, the loop is forcibly broken.
+patterns are now accepted, but whenever an iteration of such a group matches no
+characters, matching moves on to the next item in the pattern instead of
+repeatedly matching an empty string. This does not prevent backtracking into
+any of the iterations if a subsequent item fails to match.
 </P>
 <P>
 By default, quantifiers are "greedy", that is, they match as much as possible
@ -2139,10 +2087,10 @@ be easier to remember:
 <pre>
  (*atomic:\d+)foo
 </pre>
-This kind of parenthesized group "locks up" the  part of the pattern it
-contains once it has matched, and a failure further into the pattern is
-prevented from backtracking into it. Backtracking past it to previous items,
-however, works as normal.
+This kind of parenthesized group "locks up" the part of the pattern it contains
+once it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
 </P>
 <P>
 An alternative description is that a group of this type matches exactly the
@ -2341,11 +2289,11 @@ using alternation, as in the example above, or by a quantifier with a minimum
 of zero.
 </P>
 <P>
-Backreferences of this type cause the group that they reference to be treated
-as an
+For versions of PCRE2 less than 10.25, backreferences of this type used to
+cause the group that they reference to be treated as an
 <a href="#atomicgroup">atomic group.</a>
-Once the whole group has been matched, a subsequent matching failure cannot
-cause backtracking into the middle of the group.
+This restriction no longer applies, and backtracking into such groups can occur
+as normal.
 <a name="bigassertions"></a></P>
 <br><a name="SEC20" href="#TOC1">ASSERTIONS</a><br>
 <P>
@ -2361,10 +2309,18 @@ those that look behind it, and in each case an assertion may be positive (must
 match for the assertion to be true) or negative (must not match for the
 assertion to be true). An assertion group is matched in the normal way,
 and if it is true, matching continues after it, but with the matching position
-in the subject string is was it was before the assertion was processed.
+in the subject string reset to what it was before the assertion was processed.
 </P>
 <P>
-A lookaround assertion may also appear as the condition in a
+The Perl-compatible lookaround assertions are atomic. If an assertion is true,
+but there is a subsequent matching failure, there is no backtracking into the
+assertion. However, there are some cases where non-atomic assertions can be
+useful. PCRE2 has some support for these, described in the section entitled
+<a href="#nonatomicassertions">"Non-atomic assertions"</a>
+below, but they are not Perl-compatible.
+</P>
+<P>
+A lookaround assertion may appear as the condition in a
 <a href="#conditions">conditional group</a>
 (see below). In this case, the result of matching the assertion determines
 which branch of the condition is followed.
@ -2397,36 +2353,23 @@ control passes to the previous backtracking point, thus discarding any captured
 strings within the assertion.
 </P>
 <P>
-For compatibility with Perl, most assertion groups may be repeated; though it
-makes no sense to assert the same thing several times, the side effect of
-capturing may occasionally be useful. However, an assertion that forms the
-condition for a conditional group may not be quantified. In practice, for
-other assertions, there only three cases:
-<br>
-<br>
-(1) If the quantifier is {0}, the assertion is never obeyed during matching.
-However, it may contain internal capture groups that are called from elsewhere
-via the
-<a href="#groupsassubroutines">subroutine mechanism.</a>
-<br>
-<br>
-(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
-were {0,1}. At run time, the rest of the pattern match is tried with and
-without the assertion, the order depending on the greediness of the quantifier.
-<br>
-<br>
-(3) If the minimum repetition is greater than zero, the quantifier is ignored.
-The assertion is obeyed just once when encountered during matching.
+Most assertion groups may be repeated; though it makes no sense to assert the
+same thing several times, the side effect of capturing in positive assertions
+may occasionally be useful. However, an assertion that forms the condition for
+a conditional group may not be quantified. PCRE2 used to restrict the
+repetition of assertions, but from release 10.35 the only restriction is that
+an unlimited maximum repetition is changed to be one more than the minimum. For
+example, {3,} is treated as {3,4}.
 </P>
 <br><b>
 Alphabetic assertion names
 </b><br>
 <P>
-Traditionally, symbolic sequences such as (?= and (?&#60;= have been used to specify
-lookaround assertions. Perl 5.28 introduced some experimental alphabetic
-alternatives which might be easier to remember. They all start with (* instead
-of (? and must be written using lower case letters. PCRE2 supports the
-following synonyms:
+Traditionally, symbolic sequences such as (?= and (?&#60;= have been used to
+specify lookaround assertions. Perl 5.28 introduced some experimental
+alphabetic alternatives which might be easier to remember. They all start with
+(* instead of (? and must be written using lower case letters. PCRE2 supports
+the following synonyms:
 <pre>
  (*positive_lookahead:  or (*pla: is the same as (?=
  (*negative_lookahead:  or (*nla: is the same as (?!
@ -2599,8 +2542,69 @@ preceded by "foo", while
 </pre>
 is another pattern that matches "foo" preceded by three digits and any three
 characters that are not "999".
+<a name="nonatomicassertions"></a></P>
+<br><a name="SEC21" href="#TOC1">NON-ATOMIC ASSERTIONS</a><br>
+<P>
+The traditional Perl-compatible lookaround assertions are atomic. That is, if
+an assertion is true, but there is a subsequent matching failure, there is no
+backtracking into the assertion. However, there are some cases where non-atomic
+positive assertions can be useful. PCRE2 provides these using the following
+syntax:
+<pre>
+  (*non_atomic_positive_lookahead:  or (*napla: or (?*
+  (*non_atomic_positive_lookbehind: or (*naplb: or (?&#60;*
+</pre>
+Consider the problem of finding the right-most word in a string that also
+appears earlier in the string, that is, it must appear at least twice in total.
+This pattern returns the required result as captured substring 1:
+<pre>
+  ^(?x)(*napla: .* \b(\w++)) (?&#62; .*? \b\1\b ){2}
+</pre>
+For a subject such as "word1 word2 word3 word2 word3 word4" the result is
+"word3". How does it work? At the start, ^(?x) anchors the pattern and sets the
+"x" option, which causes white space (introduced for readability) to be
+ignored. Inside the assertion, the greedy .* at first consumes the entire
+string, but then has to backtrack until the rest of the assertion can match a
+word, which is captured by group 1. In other words, when the assertion first
+succeeds, it captures the right-most word in the string.
 </P>
-<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
+<P>
+The current matching point is then reset to the start of the subject, and the
+rest of the pattern match checks for two occurrences of the captured word,
+using an ungreedy .*? to scan from the left. If this succeeds, we are done, but
+if the last word in the string does not occur twice, this part of the pattern
+fails. If a traditional atomic lookhead (?= or (*pla: had been used, the
+assertion could not be re-entered, and the whole match would fail. The pattern
+would succeed only if the very last word in the subject was found twice.
+</P>
+<P>
+Using a non-atomic lookahead, however, means that when the last word does not
+occur twice in the string, the lookahead can backtrack and find the second-last
+word, and so on, until either the match succeeds, or all words have been
+tested.
+</P>
+<P>
+Two conditions must be met for a non-atomic assertion to be useful: the
+contents of one or more capturing groups must change after a backtrack into the
+assertion, and there must be a backreference to a changed group later in the
+pattern. If this is not the case, the rest of the pattern match fails exactly
+as before because nothing has changed, so using a non-atomic assertion just
+wastes resources.
+</P>
+<P>
+There is one exception to backtracking into a non-atomic assertion. If an
+(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That
+is, a subsequent match failure cannot backtrack into the assertion.
+</P>
+<P>
+Non-atomic assertions are not supported by the alternative matching function
+<b>pcre2_dfa_match()</b>. They are supported by JIT, but only if they do not
+contain any control verbs such as (*ACCEPT). (This may change in future). Note
+that assertions that appear as conditions for
+<a href="#conditions">conditional groups</a>
+(see below) must be atomic.
+</P>
+<br><a name="SEC22" href="#TOC1">SCRIPT RUNS</a><br>
 <P>
 In concept, a script run is a sequence of characters that are all from the same
 Unicode script such as Latin or Greek. However, because some scripts are
@ -2662,7 +2666,7 @@ parentheses.
 should not be used within a script run group, because it causes an immediate
 exit from the group, bypassing the script run checking.
 <a name="conditions"></a></P>
-<br><a name="SEC22" href="#TOC1">CONDITIONAL GROUPS</a><br>
+<br><a name="SEC23" href="#TOC1">CONDITIONAL GROUPS</a><br>
 <P>
 It is possible to cause the matching process to obey a pattern fragment
 conditionally or to choose between two alternative fragments, depending on
@ -2807,7 +2811,7 @@ breaks):
  (?(DEFINE) (?&#60;byte&#62; 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
  \b (?&byte) (\.(?&byte)){3} \b
 </pre>
-The first part of the pattern is a DEFINE group inside which a another group
+The first part of the pattern is a DEFINE group inside which another group
 named "byte" is defined. This matches an individual component of an IPv4
 address (a number less than 256). When matching takes place, this part of the
 pattern is skipped because DEFINE acts like a false condition. The rest of the
@ -2838,8 +2842,13 @@ Assertion conditions
 <P>
 If the condition is not in any of the above formats, it must be a parenthesized
 assertion. This may be a positive or negative lookahead or lookbehind
-assertion. Consider this pattern, again containing non-significant white space,
-and with the two alternatives on the second line:
+assertion. However, it must be a traditional atomic assertion, not one of the
+PCRE2-specific
+<a href="#nonatomicassertions">non-atomic assertions.</a>
+</P>
+<P>
+Consider this pattern, again containing non-significant white space, and with
+the two alternatives on the second line:
 <pre>
  (?(?=[^a-z]*[a-z])
  \d{2}-[a-z]{3}-\d{2}  |  \d{2}-\d{2}-\d{2} )
@ -2858,7 +2867,7 @@ positive and negative assertions, because matching always continues after the
 assertion, whether it succeeds or fails. (Compare non-conditional assertions,
 for which captures are retained only for positive assertions that succeed.)
 <a name="comments"></a></P>
-<br><a name="SEC23" href="#TOC1">COMMENTS</a><br>
+<br><a name="SEC24" href="#TOC1">COMMENTS</a><br>
 <P>
 There are two ways of including comments in patterns that are processed by
 PCRE2. In both cases, the start of the comment must not be in a character
@ -2888,7 +2897,7 @@ a newline in the pattern. The sequence \n is still literal at this stage, so
 it does not terminate the comment. Only an actual character with the code value
 0x0a (the default newline) does so.
 <a name="recursion"></a></P>
-<br><a name="SEC24" href="#TOC1">RECURSIVE PATTERNS</a><br>
+<br><a name="SEC25" href="#TOC1">RECURSIVE PATTERNS</a><br>
 <P>
 Consider the problem of matching a string in parentheses, allowing for
 unlimited nested parentheses. Without the use of recursion, the best that can
@ -3076,7 +3085,7 @@ alternative matches "a" and then recurses. In the recursion, \1 does now match
 "b" and so the whole match succeeds. This match used to fail in Perl, but in
 later versions (I tried 5.024) it now works.
 <a name="groupsassubroutines"></a></P>
-<br><a name="SEC25" href="#TOC1">GROUPS AS SUBROUTINES</a><br>
+<br><a name="SEC26" href="#TOC1">GROUPS AS SUBROUTINES</a><br>
 <P>
 If the syntax for a recursive group call (either by number or by name) is used
 outside the parentheses to which it refers, it operates a bit like a subroutine
@ -3124,7 +3133,7 @@ in groups when called as subroutines is described in the section entitled
 <a href="#btsub">"Backtracking verbs in subroutines"</a>
 below.
 <a name="onigurumasubroutines"></a></P>
-<br><a name="SEC26" href="#TOC1">ONIGURUMA SUBROUTINE SYNTAX</a><br>
+<br><a name="SEC27" href="#TOC1">ONIGURUMA SUBROUTINE SYNTAX</a><br>
 <P>
 For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or
 a number enclosed either in angle brackets or single quotes, is an alternative
@ -3142,7 +3151,7 @@ plus or a minus sign it is taken as a relative reference. For example:
 Note that \g{...} (Perl syntax) and \g&#60;...&#62; (Oniguruma syntax) are <i>not</i>
 synonymous. The former is a backreference; the latter is a subroutine call.
 </P>
-<br><a name="SEC27" href="#TOC1">CALLOUTS</a><br>
+<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
 <P>
 Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
 code to be obeyed in the middle of matching a regular expression. This makes it
@ -3218,13 +3227,13 @@ example:
 </pre>
 The doubling is removed before the string is passed to the callout function.
 <a name="backtrackcontrol"></a></P>
-<br><a name="SEC28" href="#TOC1">BACKTRACKING CONTROL</a><br>
+<br><a name="SEC29" href="#TOC1">BACKTRACKING CONTROL</a><br>
 <P>
 There are a number of special "Backtracking Control Verbs" (to use Perl's
 terminology) that modify the behaviour of backtracking during matching. They
 are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form,
-possibly behaving differently depending on whether or not a name is present.
-The names are not required to be unique within the pattern.
+and may behave differently depending on whether or not a name argument is
+present. The names are not required to be unique within the pattern.
 </P>
 <P>
 By default, for compatibility with Perl, a name is any sequence of characters
@ -3252,7 +3261,8 @@ PCRE2_ALT_VERBNAMES is also set.
 The maximum length of a name is 255 in the 8-bit library and 65535 in the
 16-bit and 32-bit libraries. If the name is empty, that is, if the closing
 parenthesis immediately follows the colon, the effect is as if the colon were
-not there. Any number of these verbs may occur in a pattern.
+not there. Any number of these verbs may occur in a pattern. Except for
+(*ACCEPT), they may not be quantified.
 </P>
 <P>
 Since these verbs are specifically related to backtracking, most of them can be
@ -3315,6 +3325,19 @@ This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by
 the outer parentheses.
 </P>
 <P>
+(*ACCEPT) is the only backtracking verb that is allowed to be quantified
+because an ungreedy quantification with a minimum of zero acts only when a
+backtrack happens. Consider, for example,
+<pre>
+  (A(*ACCEPT)??B)C
+</pre>
+where A, B, and C may be complex expressions. After matching "A", the matcher
+processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and
+the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT)
+(see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means
+"succeed on backtrack".
+</P>
+<P>
 <b>Warning:</b> (*ACCEPT) should not be used within a script run group, because
 it causes an immediate exit from the group, bypassing the script run checking.
 <pre>
@ -3332,8 +3355,9 @@ A match with the string "aaaa" always fails, but the callout is taken before
 each backtrack happens (in this example, 10 times).
 </P>
 <P>
-(*ACCEPT:NAME) and (*FAIL:NAME) are treated as (*MARK:NAME)(*ACCEPT) and
-(*MARK:NAME)(*FAIL), respectively.
+(*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and
+(*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before
+the verb acts.
 </P>
 <br><b>
 Recording which path was taken
@ -3497,10 +3521,16 @@ successful match if there is a later mismatch. Consider:
 </pre>
 If the subject is "aaaac...", after the first match attempt fails (starting at
 the first character in the string), the starting point skips on to start the
-next attempt at "c". Note that a possessive quantifer does not have the same
+next attempt at "c". Note that a possessive quantifier does not have the same
 effect as this example; although it would suppress backtracking during the
 first match attempt, the second attempt would start at the second character
 instead of skipping on to "c".
+</P>
+<P>
+If (*SKIP) is used to specify a new starting position that is the same as the
+starting position of the current match, or (by being inside a lookbehind)
+earlier, the position specified by (*SKIP) is ignored, and instead the normal
+"bumpalong" occurs.
 <pre>
  (*SKIP:NAME)
 </pre>
@ -3665,11 +3695,20 @@ retained in both cases.
 </P>
 <P>
 The remaining verbs act only when a later failure causes a backtrack to
-reach them. This means that their effect is confined to the assertion,
-because lookaround assertions are atomic. A backtrack that occurs after an
-assertion is complete does not jump back into the assertion. Note in particular
-that a (*MARK) name that is set in an assertion is not "seen" by an instance of
-(*SKIP:NAME) latter in the pattern.
+reach them. This means that, for the Perl-compatible assertions, their effect
+is confined to the assertion, because Perl lookaround assertions are atomic. A
+backtrack that occurs after such an assertion is complete does not jump back
+into the assertion. Note in particular that a (*MARK) name that is set in an
+assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.
+</P>
+<P>
+PCRE2 now supports non-atomic positive assertions, as described in the section
+entitled
+<a href="#nonatomicassertions">"Non-atomic assertions"</a>
+above. These assertions must be standalone (not used as conditions). They are
+not Perl-compatible. For these assertions, a later backtrack does jump back
+into the assertion, and therefore verbs such as (*COMMIT) can be triggered by
+backtracks from later in the pattern.
 </P>
 <P>
 The effect of (*THEN) is not allowed to escape beyond an assertion. If there
@ -3711,25 +3750,25 @@ enclosing group that has alternatives (its normal behaviour). However, if there
 is no such group within the subroutine's group, the subroutine match fails and
 there is a backtrack at the outer level.
 </P>
-<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC30" href="#TOC1">SEE ALSO</a><br>
 <P>
 <b>pcre2api</b>(3), <b>pcre2callout</b>(3), <b>pcre2matching</b>(3),
 <b>pcre2syntax</b>(3), <b>pcre2</b>(3).
 </P>
-<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC31" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
-<br><a name="SEC31" href="#TOC1">REVISION</a><br>
+<br><a name="SEC32" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 12 February 2019
+Last updated: 12 January 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2perform.html
+++ b/doc/html/pcre2perform.html
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code. 
+</P>
+<P>
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+</P>
+<P>
+Until release 10.41, an initial 20KiB frames vector was allocated on the system 
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to <b>pcre2_match()</b>. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+</P>
+<P>
+The size of the initial block is the larger of 20KiB or ten times the pattern's 
+frame size, unless the heap limit is less than this, in which case the heap 
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is 
+checked only when a new block is to be allocated. Reducing the heap limit 
+between calls to <b>pcre2_match()</b> with the same match data block does not 
+affect the saved block.
 </P>
 <P>
 In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC6" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 03 February 2019
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2posix.html
+++ b/doc/html/pcre2posix.html
@ -68,11 +68,14 @@ application. Because the POSIX functions call the native ones, it is also
 necessary to add <b>-lpcre2-8</b>.
 </P>
 <P>
-Although they are not defined as protypes in <b>pcre2posix.h</b>, the library
-does contain functions with the POSIX names <b>regcomp()</b> etc. These simply
-pass their arguments to the PCRE2 functions. These functions are provided for
-backwards compatibility with earlier versions of PCRE2, so that existing
-programs do not have to be recompiled.
+Although they were not defined as protypes in <b>pcre2posix.h</b>, releases
+10.33 to 10.36 of the library contained functions with the POSIX names
+<b>regcomp()</b> etc. These simply passed their arguments to the PCRE2
+functions. These functions were provided for backwards compatibility with
+earlier versions of PCRE2, which had only POSIX names. However, this has proved
+troublesome in situations where a program links with several libraries, some of
+which use PCRE2's POSIX interface while others use the real POSIX functions.
+For this reason, the POSIX names have been removed since release 10.37.
 </P>
 <P>
 Calling the header file <b>pcre2posix.h</b> avoids any conflict with other POSIX
@ -344,9 +347,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC10" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 30 January 2019
+Last updated: 26 April 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2serialize.html
+++ b/doc/html/pcre2serialize.html
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
 <br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
 <P>
 <b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
-<b>  int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
+<b>  int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
 <b>  pcre2_general_context *<i>gcontext</i>);</b>
 <br>
 <br>
-<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
-<b>  int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
+<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
+<b>  int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
 <b>  PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
 <br>
 <br>
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
 <pre>
  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 </pre>
@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
 <b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
 stream is no longer needed and can be discarded.
 <pre>
-  int32_t number_of_codes;
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = &#60;serialized data&#62;;
  int32_t number_of_codes =
--- a/doc/html/pcre2syntax.html
+++ b/doc/html/pcre2syntax.html
@ -19,28 +19,31 @@ please consult the man page, in case the conversion went wrong.
 <li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
 <li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
 <li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
-<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
-<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
-<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
-<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
-<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
-<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
-<li><a name="TOC13" href="#SEC13">CAPTURING</a>
-<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
-<li><a name="TOC15" href="#SEC15">COMMENT</a>
-<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
-<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
-<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
-<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
-<li><a name="TOC20" href="#SEC20">SCRIPT RUNS</a>
-<li><a name="TOC21" href="#SEC21">BACKREFERENCES</a>
-<li><a name="TOC22" href="#SEC22">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
-<li><a name="TOC23" href="#SEC23">CONDITIONAL PATTERNS</a>
-<li><a name="TOC24" href="#SEC24">BACKTRACKING CONTROL</a>
-<li><a name="TOC25" href="#SEC25">CALLOUTS</a>
-<li><a name="TOC26" href="#SEC26">SEE ALSO</a>
-<li><a name="TOC27" href="#SEC27">AUTHOR</a>
-<li><a name="TOC28" href="#SEC28">REVISION</a>
+<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
+<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
+<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
+<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
+<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
+<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
+<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
+<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
+<li><a name="TOC15" href="#SEC15">CAPTURING</a>
+<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
+<li><a name="TOC17" href="#SEC17">COMMENT</a>
+<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
+<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
+<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
+<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
+<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
+<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
+<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
+<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
+<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
+<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
+<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
+<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
+<li><a name="TOC30" href="#SEC30">AUTHOR</a>
+<li><a name="TOC31" href="#SEC31">REVISION</a>
 </ul>
 <br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
 <P>
@ -135,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
 sequences is changed to use Unicode properties and they match many more
 characters.
 </P>
+<P>
+Property descriptions in \p and \P are matched caselessly; hyphens,
+underscores, and white space are ignored, in accordance with Unicode's "loose
+matching" rules.
+</P>
 <br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
 <P>
 <pre>
@ -151,6 +159,7 @@ characters.
  Lo         Other letter
  Lt         Title case letter
  Lu         Upper case letter
+  Lc         Ll, Lu, or Lt
  L&         Ll, Lu, or Lt

  M          Mark
@ -197,158 +206,58 @@ characters.
 Perl and POSIX space are now the same. Perl added VT to its space character set
 at release 5.18.
 </P>
-<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
+<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
 <P>
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Vai,
-Warang_Citi,
-Yi,
-Zanabazar_Square.
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\p and \P, along with their abbreviations, by running this command:
+<pre>
+  pcre2test -LP
+</PRE>
 </P>
-<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
+<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
+<P>
+Many script names and their 4-letter abbreviations are recognized in
+\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
+course). You can obtain a list of these scripts by running this command:
+<pre>
+  pcre2test -LS
+</PRE>
+</P>
+<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
+<P>
+<pre>
+  \p{Bidi_Class:&#60;class&#62;}   matches a character with the given class
+  \p{BC:&#60;class&#62;}           matches a character with the given class
+</pre>
+The recognized classes are:
+<pre>
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          which space
+</PRE>
+</P>
+<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
 <P>
 <pre>
  [...]       positive character class
@ -376,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
 but some of them use Unicode properties if PCRE2_UCP is set. You can use
 \Q...\E inside a character class.
 </P>
-<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
+<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
 <P>
 <pre>
  ?           0 or 1, greedy
@ -397,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
  {n,}?       n or more, lazy
 </PRE>
 </P>
-<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
+<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
 <P>
 <pre>
  \b          word boundary
@ -415,20 +324,23 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
  \G          first matching position in subject
 </PRE>
 </P>
-<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
+<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
 <P>
 <pre>
  \K          set reported start of match
 </pre>
+From release 10.38 \K is not permitted by default in lookaround assertions,
+for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option is set, the previous behaviour is re-enabled. When this option is set,
 \K is honoured in positive assertions, but ignored in negative ones.
 </P>
-<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
+<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
 <P>
 <pre>
  expr|expr|expr...
 </PRE>
 </P>
-<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
+<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
 <P>
 <pre>
  (...)           capture group
@ -443,26 +355,26 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
 in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
 both cases, a name must not start with a digit.
 </P>
-<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
+<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
 <P>
 <pre>
  (?&#62;...)         atomic non-capture group
  (*atomic:...)   atomic non-capture group
 </PRE>
 </P>
-<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
+<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
 <P>
 <pre>
  (?#....)        comment (not nestable)
 </PRE>
 </P>
-<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
+<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
 <P>
 Changes of these options within a group are automatically cancelled at the end
 of the group.
 <pre>
  (?i)            caseless
-  (?J)            allow duplicate names
+  (?J)            allow duplicate named groups
  (?m)            multiline
  (?n)            no auto capture
  (?s)            single line (dotall)
@ -501,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
 application can lock out the use of (*UTF) and (*UCP) by setting the
 PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
 </P>
-<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
+<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
 <P>
 These are recognized only at the very start of the pattern or after option
 settings with a similar syntax.
@ -514,7 +426,7 @@ settings with a similar syntax.
  (*NUL)          the NUL character (binary zero)
 </PRE>
 </P>
-<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
+<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
 <P>
 These are recognized only at the very start of the pattern or after option
 setting with a similar syntax.
@ -523,7 +435,7 @@ setting with a similar syntax.
  (*BSR_UNICODE)  any Unicode newline sequence
 </PRE>
 </P>
-<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
+<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
 <P>
 <pre>
  (?=...)                     )
@ -544,7 +456,20 @@ setting with a similar syntax.
 </pre>
 Each top-level branch of a lookbehind must be of a fixed length.
 </P>
-<br><a name="SEC20" href="#TOC1">SCRIPT RUNS</a><br>
+<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
+<P>
+These assertions are specific to PCRE2 and are not Perl-compatible.
+<pre>
+  (?*...)                                )
+  (*napla:...)                           ) synonyms
+  (*non_atomic_positive_lookahead:...)   )
+
+  (?&#60;*...)                               )
+  (*naplb:...)                           ) synonyms
+  (*non_atomic_positive_lookbehind:...)  )
+</PRE>
+</P>
+<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
 <P>
 <pre>
  (*script_run:...)           ) script run, can be backtracked into
@ -554,7 +479,7 @@ Each top-level branch of a lookbehind must be of a fixed length.
  (*asr:...)                  )
 </PRE>
 </P>
-<br><a name="SEC21" href="#TOC1">BACKREFERENCES</a><br>
+<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
 <P>
 <pre>
  \n              reference by number (can be ambiguous)
@ -571,7 +496,7 @@ Each top-level branch of a lookbehind must be of a fixed length.
  (?P=name)       reference by name (Python)
 </PRE>
 </P>
-<br><a name="SEC22" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
+<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
 <P>
 <pre>
  (?R)            recurse whole pattern
@ -590,7 +515,7 @@ Each top-level branch of a lookbehind must be of a fixed length.
  \g'-n'          call subroutine by relative number (PCRE2 extension)
 </PRE>
 </P>
-<br><a name="SEC23" href="#TOC1">CONDITIONAL PATTERNS</a><br>
+<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
 <P>
 <pre>
  (?(condition)yes-pattern)
@ -613,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
 conditions or recursion tests. Such a condition is interpreted as a reference
 condition if the relevant named group exists.
 </P>
-<br><a name="SEC24" href="#TOC1">BACKTRACKING CONTROL</a><br>
+<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
 <P>
 All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
 name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
@ -640,7 +565,7 @@ pattern is not anchored.
 The effect of one of these verbs in a group called as a subroutine is confined
 to the subroutine call.
 </P>
-<br><a name="SEC25" href="#TOC1">CALLOUTS</a><br>
+<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
 <P>
 <pre>
  (?C)            callout (assumed number 0)
@ -651,25 +576,25 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
 start and the end), and the starting delimiter { matched with the ending
 delimiter }. To encode the ending delimiter within the string, double it.
 </P>
-<br><a name="SEC26" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
 <P>
 <b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
 <b>pcre2matching</b>(3), <b>pcre2</b>(3).
 </P>
-<br><a name="SEC27" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
-<br><a name="SEC28" href="#TOC1">REVISION</a><br>
+<br><a name="SEC31" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 11 February 2019
+Last updated: 12 January 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@ -59,12 +59,7 @@ patterns, and the subject lines specify PCRE2 function options, control how the
 subject is processed, and what output is produced.
 </P>
 <P>
-As the original fairly simple PCRE library evolved, it acquired many different
-features, and as a result, the original <b>pcretest</b> program ended up with a
-lot of options in a messy, arcane syntax for testing all the features. The
-move to the new PCRE2 API provided an opportunity to re-implement the test
-program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there
-are still many obscure modifiers, some of which are specifically designed for
+There are many obscure modifiers, some of which are specifically designed for
 use in conjunction with the test script and data files that are distributed as
 part of PCRE2. All the modifiers are documented here, some without much
 justification, but many of them are unlikely to be of use except when testing
@ -83,16 +78,16 @@ to 8-bit code units for output.
 </P>
 <P>
 In the rest of this document, the names of library functions and structures
-are given in generic form, for example, <b>pcre_compile()</b>. The actual
+are given in generic form, for example, <b>pcre2_compile()</b>. The actual
 names used in the libraries have a suffix _8, _16, or _32, as appropriate.
 <a name="inputencoding"></a></P>
 <br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
 <P>
 Input to <b>pcre2test</b> is processed line by line, either by calling the C
-library's <b>fgets()</b> function, or via the <b>libreadline</b> library. In some
-Windows environments character 26 (hex 1A) causes an immediate end of file, and
-no further data is read, so this character should be avoided unless you really
-want that action.
+library's <b>fgets()</b> function, or via the <b>libreadline</b> or <b>libedit</b>
+library. In some Windows environments character 26 (hex 1A) causes an immediate
+end of file, and no further data is read, so this character should be avoided
+unless you really want that action.
 </P>
 <P>
 The input is processed using using C's string functions, so must not
@ -242,19 +237,38 @@ Behave as if each pattern line has the <b>jit</b> modifier; after successful
 compilation, each pattern is passed to the just-in-time compiler, if available.
 </P>
 <P>
+<b>-jitfast</b>
+Behave as if each pattern line has the <b>jitfast</b> modifier; after
+successful compilation, each pattern is passed to the just-in-time compiler, if
+available, and each subject line is passed directly to the JIT matcher via its
+"fast path".
+</P>
+<P>
 <b>-jitverify</b>
 Behave as if each pattern line has the <b>jitverify</b> modifier; after
 successful compilation, each pattern is passed to the just-in-time compiler, if
-available, and the use of JIT is verified.
+available, and the use of JIT for matching is verified.
 </P>
 <P>
 <b>-LM</b>
 List modifiers: write a list of available pattern and subject modifiers to the
 standard output, then exit with zero exit code. All other options are ignored.
-If both -C and -LM are present, whichever is first is recognized.
+If both -C and any -Lx options are present, whichever is first is recognized.
 </P>
 <P>
-\fB-pattern\fB <i>modifier-list</i>
+<b>-LP</b>
+List properties: write a list of recognized Unicode properties to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
+</P>
+<P>
+<b>-LS</b>
+List scripts: write a list of recogized Unicode script names to the standard
+output, then exit with zero exit code. All other options are ignored. If both
+-C and any -Lx options are present, whichever is first is recognized.
+</P>
+<P>
+<b>-pattern</b> <i>modifier-list</i>
 Behave as if each pattern line contains the given modifiers.
 </P>
 <P>
@ -316,7 +330,7 @@ test data, command lines that begin with # may appear. This file format, with
 some restrictions, can also be processed by the <b>perltest.sh</b> script that
 is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
 and Perl is the same. For a specification of <b>perltest.sh</b>, see the
-comments near its beginning.
+comments near its beginning. See also the #perltest command below.
 </P>
 <P>
 When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
@ -368,6 +382,12 @@ output.
 This command is used to load a set of precompiled patterns from a file, as
 described in the section entitled "Saving and restoring compiled patterns"
 <a href="#saverestore">below.</a>
+<pre>
+  #loadtables &#60;filename&#62;
+</pre>
+This command is used to load a set of binary character tables that can be
+accessed by the tables=3 qualifier. Such tables can be created by the
+<b>pcre2_dftables</b> program with the -b option.
 <pre>
  #newline_default [&#60;newline-list&#62;]
 </pre>
@ -407,14 +427,20 @@ patterns. Modifiers on a pattern can change these settings.
 <pre>
  #perltest
 </pre>
-The appearance of this line causes all subsequent modifier settings to be
-checked for compatibility with the <b>perltest.sh</b> script, which is used to
-confirm that Perl gives the same results as PCRE2. Also, apart from comment
-lines, #pattern commands, and #subject commands that set or unset "mark", no
-command lines are permitted, because they and many of the modifiers are
-specific to <b>pcre2test</b>, and should not be used in test files that are also
-processed by <b>perltest.sh</b>. The <b>#perltest</b> command helps detect tests
-that are accidentally put in the wrong file.
+This line is used in test files that can also be processed by <b>perltest.sh</b>
+to confirm that Perl gives the same results as PCRE2. Subsequent tests are
+checked for the use of <b>pcre2test</b> features that are incompatible with the
+<b>perltest.sh</b> script.
+</P>
+<P>
+Patterns must use '/' as their delimiter, and only certain modifiers are
+supported. Comment lines, #pattern commands, and #subject commands that set or
+unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
+#newline_default commands, which are needed in the relevant pcre2test files,
+are silently ignored. All other command lines are ignored, but give a warning
+message. The <b>#perltest</b> command helps detect tests that are accidentally
+put in the wrong file or use the wrong delimiter. For more details of the
+<b>perltest.sh</b> script see the comments it contains.
 <pre>
  #pop [&#60;modifiers&#62;]
  #popcopy [&#60;modifiers&#62;]
@ -467,15 +493,17 @@ excluding pattern meta-characters):
 </pre>
 This is interpreted as the pattern's delimiter. A regular expression may be
 continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it with a backslash, for example
+included within it. It is possible to include the delimiter as a literal within
+the pattern by escaping it with a backslash, for example
 <pre>
  /abc\/def/
 </pre>
 If you do this, the escape and the delimiter form part of the pattern, but
-since the delimiters are all non-alphanumeric, this does not affect its
-interpretation. If the terminating delimiter is immediately followed by a
-backslash, for example,
+since the delimiters are all non-alphanumeric, the inclusion of the backslash
+does not affect the pattern's interpretation. Note, however, that this trick
+does not work within \Q...\E literal bracketing because the backslash will
+itself be interpreted as a literal. If the terminating delimiter is immediately
+followed by a backslash, for example,
 <pre>
  /abc/\
 </pre>
@ -493,11 +521,11 @@ A pattern can be followed by a modifier list (details below).
 </P>
 <br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br>
 <P>
-Before each subject line is passed to <b>pcre2_match()</b> or
-<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the
-line is scanned for backslash escapes, unless the <b>subject_literal</b>
-modifier was set for the pattern. The following provide a means of encoding
-non-printing characters in a visible way:
+Before each subject line is passed to <b>pcre2_match()</b>,
+<b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>, leading and trailing white
+space is removed, and the line is scanned for backslash escapes, unless the
+<b>subject_literal</b> modifier was set for the pattern. The following provide a
+means of encoding non-printing characters in a visible way:
 <pre>
  \a         alarm (BEL, \x07)
  \b         backspace (\x08)
@ -594,6 +622,7 @@ way <b>pcre2_compile()</b> behaves. See
 for a description of the effects of these options.
 <pre>
      allow_empty_class         set PCRE2_ALLOW_EMPTY_CLASS
+      allow_lookaround_bsk      set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
      allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
      alt_bsux                  set PCRE2_ALT_BSUX
      alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
@ -613,6 +642,7 @@ for a description of the effects of these options.
      firstline                 set PCRE2_FIRSTLINE
      literal                   set PCRE2_LITERAL
      match_line                set PCRE2_EXTRA_MATCH_LINE
+      match_invalid_utf         set PCRE2_MATCH_INVALID_UTF
      match_unset_backref       set PCRE2_MATCH_UNSET_BACKREF
      match_word                set PCRE2_EXTRA_MATCH_WORD
  /m  multiline                 set PCRE2_MULTILINE
@ -671,7 +701,7 @@ heavily used in the test files.
      pushcopy                  push a copy onto the stack
      stackguard=&#60;number&#62;       test the stackguard feature
      subject_literal           treat all subject lines as literal
-      tables=[0|1|2]            select internal tables
+      tables=[0|1|2|3]          select internal tables
      use_length                do not zero-terminate the pattern
      utf8_input                treat input as UTF-8
 </pre>
@ -738,7 +768,9 @@ options, the line is omitted. "First code unit" is where any match must start;
 if there is more than one they are listed as "starting code units". "Last code
 unit" is the last literal code unit that must be present in any match. This is
 not necessarily the last character. These lines are omitted if no starting or
-ending code units are recorded.
+ending code units are recorded. The subject length line is omitted when
+<b>no_start_optimize</b> is set because the minimum length is not calculated
+when it can never be used.
 </P>
 <P>
 The <b>framesize</b> modifier shows the size, in bytes, of the storage frames
@ -1017,18 +1049,20 @@ Using alternative character tables
 </b><br>
 <P>
 The value specified for the <b>tables</b> modifier must be one of the digits 0,
-1, or 2. It causes a specific set of built-in character tables to be passed to
-<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
-different character tables. The digit specifies the tables as follows:
+1, 2, or 3. It causes a specific set of built-in character tables to be passed
+to <b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour
+with different character tables. The digit specifies the tables as follows:
 <pre>
  0   do not pass any special character tables
  1   the default ASCII tables, as distributed in
        pcre2_chartables.c.dist
  2   a set of tables defining ISO 8859 characters
+  3   a set of tables loaded by the #loadtables command
 </pre>
-In table 2, some characters whose codes are greater than 128 are identified as
-letters, digits, spaces, etc. Setting alternate character tables and a locale
-are mutually exclusive.
+In tables 2, some characters whose codes are greater than 128 are identified as
+letters, digits, spaces, etc. Tables 3 can be used only after a
+<b>#loadtables</b> command has loaded them from a binary file. Setting alternate
+character tables and a locale are mutually exclusive.
 </P>
 <br><b>
 Setting certain match controls
@ -1053,9 +1087,12 @@ process.
      startchar                   show starting character when relevant
      substitute_callout          use substitution callouts
      substitute_extended         use PCRE2_SUBSTITUTE_EXTENDED
-      substitute_skip=&#60;n&#62;        skip substitution number n
+      substitute_literal          use PCRE2_SUBSTITUTE_LITERAL
+      substitute_matched          use PCRE2_SUBSTITUTE_MATCHED
      substitute_overflow_length  use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
-      substitute_stop=&#60;n&#62;        skip substitution number n and greater
+      substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
+      substitute_skip=&#60;n&#62;         skip substitution &#60;n&#62;
+      substitute_stop=&#60;n&#62;         skip substitution &#60;n&#62; and following
      substitute_unknown_unset    use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
      substitute_unset_empty      use PCRE2_SUBSTITUTE_UNSET_EMPTY
 </pre>
@ -1186,7 +1223,7 @@ Setting match controls
 The following modifiers affect the matching process or request additional
 information. Some of them may also be specified on a pattern line (see above),
 in which case they apply to every subject line that is matched against that
-pattern.
+pattern, but can be overridden by modifiers on the subject.
 <pre>
      aftertext                  show text after match
      allaftertext               show text after captures
@ -1204,7 +1241,8 @@ pattern.
      copy=&#60;number or name&#62;      copy captured substring
      depth_limit=&#60;n&#62;            set a depth limit
      dfa                        use <b>pcre2_dfa_match()</b>
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
      get=&#60;number or name&#62;       extract captured substring
      getall                     extract all captured substrings
  /g  global                     global matching
@ -1214,6 +1252,8 @@ pattern.
      match_limit=&#60;n&#62;            set a match limit
      memory                     show heap memory usage
      null_context               match with a NULL context
+      null_replacement           substitute with NULL replacement
+      null_subject               match with NULL subject
      offset=&#60;n&#62;                 set starting offset
      offset_limit=&#60;n&#62;           set offset limit
      ovector=&#60;n&#62;                set size of output vector
@ -1223,8 +1263,11 @@ pattern.
      startoffset=&#60;n&#62;            same as offset=&#60;n&#62;
      substitute_callout         use substitution callouts
      substitute_extedded        use PCRE2_SUBSTITUTE_EXTENDED
-      substitute_skip=&#60;n&#62;        skip substitution number n
+      substitute_literal         use PCRE2_SUBSTITUTE_LITERAL
+      substitute_matched         use PCRE2_SUBSTITUTE_MATCHED
      substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
+      substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
+      substitute_skip=&#60;n&#62;        skip substitution number n
      substitute_stop=&#60;n&#62;        skip substitution number n and greater
      substitute_unknown_unset   use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
      substitute_unset_empty     use PCRE2_SUBSTITUTE_UNSET_EMPTY
@ -1249,22 +1292,27 @@ following line with a plus character following the capture number.
 </P>
 <P>
 The <b>allusedtext</b> modifier requests that all the text that was consulted
-during a successful pattern match by the interpreter should be shown. This
-feature is not supported for JIT matching, and if requested with JIT it is
-ignored (with a warning message). Setting this modifier affects the output if
-there is a lookbehind at the start of a match, or a lookahead at the end, or if
-\K is used in the pattern. Characters that precede or follow the start and end
-of the actual match are indicated in the output by '&#60;' or '&#62;' characters
-underneath them. Here is an example:
+during a successful pattern match by the interpreter should be shown, for both
+full and partial matches. This feature is not supported for JIT matching, and
+if requested with JIT it is ignored (with a warning message). Setting this
+modifier affects the output if there is a lookbehind at the start of a match,
+or, for a complete match, a lookahead at the end, or if \K is used in the
+pattern. Characters that precede or follow the start and end of the actual
+match are indicated in the output by '&#60;' or '&#62;' characters underneath them.
+Here is an example:
 <pre>
    re&#62; /(?&#60;=pqr)abc(?=xyz)/
  data&#62; 123pqrabcxyz456\=allusedtext
   0: pqrabcxyz
      &#60;&#60;&#60;   &#62;&#62;&#62;
+  data&#62; 123pqrabcxy\=ph,allusedtext
+  Partial match: pqrabcxy
+                 &#60;&#60;&#60;
 </pre>
-This shows that the matched string is "abc", with the preceding and following
-strings "pqr" and "xyz" having been consulted during the match (when processing
-the assertions).
+The first, complete match shows that the matched string is "abc", with the
+preceding and following strings "pqr" and "xyz" having been consulted during
+the match (when processing the assertions). The partial match can indicate only
+the preceding string.
 </P>
 <P>
 The <b>startchar</b> modifier requests that the starting character for the match
@ -1380,9 +1428,15 @@ Testing the substitution function
 </b><br>
 <P>
 If the <b>replace</b> modifier is set, the <b>pcre2_substitute()</b> function is
-called instead of one of the matching functions. Note that replacement strings
-cannot contain commas, because a comma signifies the end of a modifier. This is
-not thought to be an issue in a test program.
+called instead of one of the matching functions (or after one call of
+<b>pcre2_match()</b> in the case of PCRE2_SUBSTITUTE_MATCHED). Note that
+replacement strings cannot contain commas, because a comma signifies the end of
+a modifier. This is not thought to be an issue in a test program.
+</P>
+<P>
+Specifying a completely empty replacement string disables this modifier.
+However, it is possible to specify an empty replacement by providing a buffer
+length, as described below, for an otherwise empty replacement.
 </P>
 <P>
 Unlike subject strings, <b>pcre2test</b> does not process replacement strings
@ -1398,11 +1452,16 @@ for <b>pcre2_substitute()</b>:
 <pre>
  global                      PCRE2_SUBSTITUTE_GLOBAL
  substitute_extended         PCRE2_SUBSTITUTE_EXTENDED
+  substitute_literal          PCRE2_SUBSTITUTE_LITERAL
+  substitute_matched          PCRE2_SUBSTITUTE_MATCHED
  substitute_overflow_length  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
+  substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
  substitute_unknown_unset    PCRE2_SUBSTITUTE_UNKNOWN_UNSET
  substitute_unset_empty      PCRE2_SUBSTITUTE_UNSET_EMPTY
-
-</PRE>
+</pre>
+See the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+documentation for details of these options.
 </P>
 <P>
 After a successful substitution, the modified string is output, preceded by the
@ -1506,7 +1565,7 @@ Setting heap, match, and depth limits
 <P>
 The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
 the appropriate limits in the match context. These values are ignored when the
-<b>find_limits</b> modifier is specified.
+<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
 </P>
 <br><b>
 Finding minimum limits
@ -1516,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
 calls the relevant matching function several times, setting different values in
 the match context via <b>pcre2_set_heap_limit()</b>,
 <b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 </P>
 <P>
 When using this modifier, the pattern should not contain any limit settings
@ -1545,9 +1608,7 @@ overall amount of computing resource that is used.
 </P>
 <P>
 For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 </P>
 <br><b>
 Showing MARK names
@ -1565,12 +1626,10 @@ Showing memory usage
 <P>
 The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the <b>memory</b> modifier never has any effect. For this modifier to work, the
+<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
+is used only when a match requires more internal workspace that the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 <b>null_context</b> modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 </P>
@ -1624,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
 passing the replacement string as zero-terminated.
 </P>
 <br><b>
-Passing a NULL context
+Passing a NULL context, subject, or replacement
 </b><br>
 <P>
 Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
@ -1632,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
 If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-<b>find_limits</b> or <b>substitute_callout</b> modifiers.
+<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
+modifiers.
+</P>
+<P>
+Similarly, for testing purposes, if the <b>null_subject</b> or
+<b>null_replacement</b> modifier is set, the subject or replacement string
+pointers are passed as NULL, respectively, to the relevant functions.
 </P>
 <br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
 <P>
@ -1779,7 +1844,7 @@ restart the match with additional subject data by means of the
 <b>dfa_restart</b> modifier. For example:
 <pre>
    re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
-  data&#62; 23ja\=P,dfa
+  data&#62; 23ja\=ps,dfa
  Partial match: 23ja
  data&#62; n05\=dfa,dfa_restart
   0: n05
@ -2071,16 +2136,16 @@ on the stack.
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC21" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 11 March 2019
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/html/pcre2unicode.html
+++ b/doc/html/pcre2unicode.html
@ -16,22 +16,33 @@ please consult the man page, in case the conversion went wrong.
 UNICODE AND UTF SUPPORT
 </b><br>
 <P>
-When PCRE2 is built with Unicode support (which is the default), it has
-knowledge of Unicode character properties and can process text strings in
-UTF-8, UTF-16, or UTF-32 format (depending on the code unit width). However, by
-default, PCRE2 assumes that one code unit is one character. To process a
-pattern as a UTF string, where a character may require more than one code unit,
-you must call
-<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
-with the PCRE2_UTF option flag, or the pattern must start with the sequence
-(*UTF). When either of these is the case, both the pattern and any subject
-strings that are matched against it are treated as UTF strings instead of
-strings of individual one-code-unit characters. There are also some other
-changes to the way characters are handled, as documented below.
+PCRE2 is normally built with Unicode support, though if you do not need it, you
+can build it without, in which case the library will be smaller. With Unicode
+support, PCRE2 has knowledge of Unicode character properties and can process
+strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit
+width), but this is not the default. Unless specifically requested, PCRE2
+treats each code unit in a string as one character.
 </P>
 <P>
-If you do not need Unicode support you can build PCRE2 without it, in which
-case the library will be smaller.
+There are two ways of telling PCRE2 to switch to UTF mode, where characters may
+consist of more than one code unit and the range of values is constrained. The
+program can call
+<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
+with the PCRE2_UTF option, or the pattern may start with the sequence (*UTF).
+However, the latter facility can be locked out by the PCRE2_NEVER_UTF option.
+That is, the programmer can prevent the supplier of the pattern from switching
+to UTF mode.
+</P>
+<P>
+Note that the PCRE2_MATCH_INVALID_UTF option (see
+<a href="#matchinvalid">below)</a>
+forces PCRE2_UTF to be set.
+</P>
+<P>
+In UTF mode, both the pattern and any subject strings that are matched against
+it are treated as UTF strings instead of strings of individual one-code-unit
+characters. There are also some other changes to the way characters are
+handled, as documented below.
 </P>
 <br><b>
 UNICODE PROPERTY SUPPORT
@ -39,17 +50,18 @@ UNICODE PROPERTY SUPPORT
 <P>
 When PCRE2 is built with Unicode support, the escape sequences \p{..},
 \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
-The Unicode properties that can be tested are limited to the general category
-properties such as Lu for an upper case letter or Nd for a decimal number, the
-Unicode script names such as Arabic or Han, and the derived properties Any and
-L&. Full lists are given in the
+The Unicode properties that can be tested are a subset of those that Perl
+supports. Currently they are limited to the general category properties such as
+Lu for an upper case letter or Nd for a decimal number, the Unicode script
+names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
+properties Any and LC (synonym L&). Full lists are given in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
 and
 <a href="pcre2syntax.html"><b>pcre2syntax</b></a>
-documentation. Only the short names for properties are supported. For example,
-\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE2 does not support this.
+documentation. In general, only the short names for properties are supported.
+For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
+supported. Furthermore, in Perl, many properties may optionally be prefixed by
+"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
 </P>
 <br><b>
 WIDE CHARACTERS AND UTF MODES
@ -63,22 +75,22 @@ also recognized; larger ones can be coded using \o{...}.
 <P>
 The escape sequence \N{U+&#60;hex digits&#62;} is recognized as another way of
 specifying a Unicode character by code point in a UTF mode. It is not allowed
-in non-UTF modes.
+in non-UTF mode.
 </P>
 <P>
-In UTF modes, repeat quantifiers apply to complete UTF characters, not to
+In UTF mode, repeat quantifiers apply to complete UTF characters, not to
 individual code units.
 </P>
 <P>
-In UTF modes, the dot metacharacter matches one UTF character instead of a
+In UTF mode, the dot metacharacter matches one UTF character instead of a
 single code unit.
 </P>
 <P>
-In UTF modes, capture group names are not restricted to ASCII, and may contain
+In UTF mode, capture group names are not restricted to ASCII, and may contain
 any Unicode letters and decimal digits, as well as underscore.
 </P>
 <P>
-The escape sequence \C can be used to match a single code unit in a UTF mode,
+The escape sequence \C can be used to match a single code unit in UTF mode,
 but its use can lead to some strange effects because it breaks up multi-unit
 characters (see the description of \C in the
 <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
@ -93,7 +105,7 @@ may consist of more than one code unit. The use of \C in these modes provokes
 a match-time error. Also, the JIT optimization does not support \C in these
 modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that
 contains \C, it will not succeed, and so when <b>pcre2_match()</b> is called,
-the matching will be carried out by the normal interpretive function.
+the matching will be carried out by the interpretive function.
 </P>
 <P>
 The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
@ -123,14 +135,16 @@ However, the special horizontal and vertical white space matching escapes (\h,
 not PCRE2_UCP is set.
 </P>
 <br><b>
-CASE-EQUIVALENCE IN UTF MODES
+UNICODE CASE-EQUIVALENCE
 </b><br>
 <P>
-Case-insensitive matching in a UTF mode makes use of Unicode properties except
-for characters whose code points are less than 128 and that have at most two
-case-equivalent values. For these, a direct table lookup is used for speed. A
-few Unicode characters such as Greek sigma have more than two code points that
-are case-equivalent, and these are treated as such.
+If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use
+of Unicode properties except for characters whose code points are less than 128
+and that have at most two case-equivalent values. For these, a direct table
+lookup is used for speed. A few Unicode characters such as Greek sigma have
+more than two code points that are case-equivalent, and these are treated
+specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case
+processing for non-UTF character encodings such as UCS-2.
 <a name="scriptruns"></a></P>
 <br><b>
 SCRIPT RUNS
@ -248,7 +262,7 @@ VALIDITY OF UTF STRINGS
 <P>
 When the PCRE2_UTF option is set, the strings passed as patterns and subjects
 are (by default) checked for validity on entry to the relevant functions. If an
-invalid UTF string is passed, an negative error code is returned. The code unit
+invalid UTF string is passed, a negative error code is returned. The code unit
 offset to the offending character can be extracted from the match data block by
 calling <b>pcre2_get_startchar()</b>, which is used for this purpose after a UTF
 error.
@ -263,17 +277,16 @@ only valid UTF code unit sequences.
 </P>
 <P>
 If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
-is usually undefined and your program may crash or loop indefinitely. There is,
-however, one mode of matching that can handle invalid UTF subject strings. This
-is matching via the JIT optimization using the PCRE2_JIT_INVALID_UTF option
-when calling <b>pcre2_jit_compile()</b>. For details, see the
-<a href="pcre2jit.html"><b>pcre2jit</b></a>
-documentation.
+is undefined and your program may crash or loop indefinitely or give incorrect
+results. There is, however, one mode of matching that can handle invalid UTF
+subject strings. This is enabled by passing PCRE2_MATCH_INVALID_UTF to
+<b>pcre2_compile()</b> and is discussed below in the next section. The rest of
+this section covers the case when PCRE2_MATCH_INVALID_UTF is not set.
 </P>
 <P>
-Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the check for
-the pattern; it does not also apply to subject strings. If you want to disable
-the check for a subject string you must pass this same option to
+Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the UTF check
+for the pattern; it does not also apply to subject strings. If you want to
+disable the check for a subject string you must pass this same option to
 <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>.
 </P>
 <P>
@ -352,7 +365,7 @@ these code points are excluded by RFC 3629.
 <pre>
  PCRE2_ERROR_UTF8_ERR13
 </pre>
-A 4-byte character has a value greater than 0x10fff; these code points are
+A 4-byte character has a value greater than 0x10ffff; these code points are
 excluded by RFC 3629.
 <pre>
  PCRE2_ERROR_UTF8_ERR14
@ -405,7 +418,59 @@ The following negative error codes are given for invalid UTF-32 strings:
  PCRE2_ERROR_UTF32_ERR1  Surrogate character (0xd800 to 0xdfff)
  PCRE2_ERROR_UTF32_ERR2  Code point is greater than 0x10ffff

-</PRE>
+<a name="matchinvalid"></a></PRE>
+</P>
+<br><b>
+MATCHING IN INVALID UTF STRINGS
+</b><br>
+<P>
+You can run pattern matches on subject strings that may contain invalid UTF
+sequences if you call <b>pcre2_compile()</b> with the PCRE2_MATCH_INVALID_UTF
+option. This is supported by <b>pcre2_match()</b>, including JIT matching, but
+not by <b>pcre2_dfa_match()</b>. When PCRE2_MATCH_INVALID_UTF is set, it forces
+PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a
+valid UTF string.
+</P>
+<P>
+Setting PCRE2_MATCH_INVALID_UTF does not affect what <b>pcre2_compile()</b>
+generates, but if <b>pcre2_jit_compile()</b> is subsequently called, it does
+generate different code. If JIT is not used, the option affects the behaviour
+of the interpretive code in <b>pcre2_match()</b>. When PCRE2_MATCH_INVALID_UTF
+is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time.
+</P>
+<P>
+In this mode, an invalid code unit sequence in the subject never matches any
+pattern item. It does not match dot, it does not match \p{Any}, it does not
+even match negative items such as [^X]. A lookbehind assertion fails if it
+encounters an invalid sequence while moving the current point backwards. In
+other words, an invalid UTF code unit sequence acts as a barrier which no match
+can cross.
+</P>
+<P>
+You can also think of this as the subject being split up into fragments of
+valid UTF, delimited internally by invalid code unit sequences. The pattern is
+matched fragment by fragment. The result of a successful match, however, is
+given as code unit offsets in the entire subject string in the usual way. There
+are a few points to consider:
+</P>
+<P>
+The internal boundaries are not interpreted as the beginnings or ends of lines
+and so do not match circumflex or dollar characters in the pattern.
+</P>
+<P>
+If <b>pcre2_match()</b> is called with an offset that points to an invalid
+UTF-sequence, that sequence is skipped, and the match starts at the next valid
+UTF character, or the end of the subject.
+</P>
+<P>
+At internal fragment boundaries, \b and \B behave in the same way as at the
+beginning and end of the subject. For example, a sequence such as \bWORD\b
+would match an instance of WORD that is surrounded by invalid UTF code units.
+</P>
+<P>
+Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbitrary
+data, knowing that any matched strings that are returned are valid UTF. This
+can be useful when searching for UTF text in executable or other binary files.
 </P>
 <br><b>
 AUTHOR
@ -413,7 +478,7 @@ AUTHOR
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@ -422,9 +487,9 @@ Cambridge, England.
 REVISION
 </b><br>
 <P>
-Last updated: 06 March 2019
+Last updated: 22 December 2021
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2021 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
--- a/doc/index.html.src
+++ b/doc/index.html.src
@ -146,6 +146,9 @@ in the library.
 <tr><td><a href="pcre2_get_mark.html">pcre2_get_mark</a></td>
    <td>&nbsp;&nbsp;Get a (*MARK) name</td></tr>

+<tr><td><a href="pcre2_get_match_data_size.html">pcre2_get_match_data_size</a></td>
+    <td>&nbsp;&nbsp;Get the size of a match data block</td></tr>
+
 <tr><td><a href="pcre2_get_ovector_count.html">pcre2_get_ovector_count</a></td>
    <td>&nbsp;&nbsp;Get the ovector count</td></tr>

@ -176,6 +179,9 @@ in the library.
 <tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
    <td>&nbsp;&nbsp;Build character tables in current locale</td></tr>

+<tr><td><a href="pcre2_maketables_free.html">pcre2_maketables_free</a></td>
+    <td>&nbsp;&nbsp;Free character tables</td></tr>
+
 <tr><td><a href="pcre2_match.html">pcre2_match</a></td>
    <td>&nbsp;&nbsp;Match a compiled pattern to a subject string
    (Perl compatible)</td></tr>
--- a/doc/pcre2-config.txt
+++ b/doc/pcre2-config.txt
@ -16,8 +16,8 @@ DESCRIPTION

       pcre2-config returns the configuration of the installed PCRE2 libraries
       and the options required to compile a program to use them. Some of  the
-       options  apply  only  to  the  8-bit,  or  16-bit, or 32-bit libraries,
-       respectively, and are not available for libraries that  have  not  been
+       options  apply  only  to the 8-bit, or 16-bit, or 32-bit libraries, re-
+       spectively, and are not available for  libraries  that  have  not  been
       built. If an unavailable option is encountered, the "usage" information
       is output.

@ -36,30 +36,30 @@ OPTIONS
       --version Writes the version number of the installed PCRE2 libraries to
                 the standard output.

-       --libs8   Writes  to  the  standard  output  the  command  line options
-                 required to link with the 8-bit PCRE2 library  (-lpcre2-8  on
+       --libs8   Writes  to  the  standard output the command line options re-
+                 quired to link with the 8-bit  PCRE2  library  (-lpcre2-8  on
                 many systems).

-       --libs16  Writes  to  the  standard  output  the  command  line options
-                 required to link with the 16-bit PCRE2 library (-lpcre2-16 on
+       --libs16  Writes  to  the  standard output the command line options re-
+                 quired to link with the 16-bit PCRE2 library  (-lpcre2-16  on
                 many systems).

-       --libs32  Writes  to  the  standard  output  the  command  line options
-                 required to link with the 32-bit PCRE2 library (-lpcre2-32 on
+       --libs32  Writes  to  the  standard output the command line options re-
+                 quired to link with the 32-bit PCRE2 library  (-lpcre2-32  on
                 many systems).

       --libs-posix
-                 Writes  to  the  standard  output  the  command  line options
-                 required to link  with  PCRE2's  POSIX  API  wrapper  library
+                 Writes  to  the  standard output the command line options re-
+                 quired  to  link  with  PCRE2's  POSIX  API  wrapper  library
                 (-lpcre2-posix -lpcre2-8 on many systems).

-       --cflags  Writes  to  the  standard  output  the  command  line options
-                 required to compile files that use PCRE2  (this  may  include
-                 some -I options, but is blank on many systems).
+       --cflags  Writes  to  the  standard output the command line options re-
+                 quired to compile files that use PCRE2 (this may include some
+                 -I options, but is blank on many systems).

       --cflags-posix
-                 Writes  to  the  standard  output  the  command  line options
-                 required to compile files that use PCRE2's POSIX API  wrapper
+                 Writes  to  the  standard output the command line options re-
+                 quired to compile files that use PCRE2's  POSIX  API  wrapper
                 library  (this  may  include some -I options, but is blank on
                 many systems).

--- a/doc/pcre2.3
+++ b/doc/pcre2.3
@ -1,4 +1,4 @@
-.TH PCRE2 3 "17 September 2018" "PCRE2 10.33"
+.TH PCRE2 3 "27 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH INTRODUCTION
@ -11,7 +11,8 @@ nearly two decades, the limitations of the original API were making development
 increasingly difficult. The new API is more extensible, and it was simplified
 by abolishing the separate "study" optimizing function; in PCRE2, patterns are
 automatically optimized where possible. Since forking from PCRE1, the code has
-been extensively refactored and new features introduced.
+been extensively refactored and new features introduced. The old library is now
+obsolete and is no longer maintained.
 .P
 As well as Perl-style regular expression patterns, some features that appeared
 in Python and the original PCRE before they appeared in Perl are available
@ -19,8 +20,13 @@ using the Python syntax. There is also some support for one or two .NET and
 Oniguruma syntax items, and there are options for requesting some minor changes
 that give better ECMAScript (aka JavaScript) compatibility.
 .P
-The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
-code units, which means that up to three separate libraries may be installed.
+The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit,
+or 32-bit code units, which means that up to three separate libraries may be
+installed, one for each code unit size. The size of code unit is not related to
+the bit size of the underlying hardware. In a 64-bit environment that also
+supports 32-bit applications, versions of PCRE2 that are compiled in both
+64-bit and 32-bit modes may be needed.
+.P
 The original work to extend PCRE to 16-bit and 32-bit code units was done by
 Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
 can be interpreted either as one character per code unit, or as UTF-encoded
@ -185,18 +191,18 @@ function, listing its arguments and results.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .P
 Putting an actual email address here is a spam magnet. If you want to email me,
-use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
+use my two names separated by a dot at gmail.com.
 .
 .
 .SH REVISION
 .rs
 .sp
 .nf
-Last updated: 17 September 2018
-Copyright (c) 1997-2018 University of Cambridge.
+Last updated: 27 August 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
--- a/doc/pcre2_compile.3
+++ b/doc/pcre2_compile.3
@ -1,4 +1,4 @@
-.TH PCRE2_COMPILE 3 "11 February 2019" "PCRE2 10.33"
+.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -53,6 +53,7 @@ The option bits are:
  PCRE2_EXTENDED           Ignore white space and # comments
  PCRE2_FIRSTLINE          Force matching to be before newline
  PCRE2_LITERAL            Pattern characters are all literal
+  PCRE2_MATCH_INVALID_UTF  Enable support for matching invalid UTF
  PCRE2_MATCH_UNSET_BACKREF  Match unset backreferences
  PCRE2_MULTILINE          ^ and $ match newlines within data
  PCRE2_NEVER_BACKSLASH_C  Lock out the use of \eC in patterns
@ -79,8 +80,17 @@ Additional options may be set in the compile context via the
 .\"
 function.
 .P
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the \fIerrorcode\fP argument to the the
+\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
+error was encountered is returned via the \fIerroroffset\fP argument.
+.P
+If there is no error, the value passed via \fIerrorcode\fP returns the message
+"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
+via \fIerroroffset\fP is zero.
 .P
 There is a complete description of the PCRE2 native API, with more detail on
 each option, in the
--- a/doc/pcre2_dfa_match.3
+++ b/doc/pcre2_dfa_match.3
@ -1,4 +1,4 @@
-.TH PCRE2_DFA_MATCH 3 "16 October 2018" "PCRE2 10.33"
+.TH PCRE2_DFA_MATCH 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -33,10 +33,15 @@ just once (except when processing lookaround assertions). This function is
  \fIworkspace\fP    Points to a vector of ints used as working space
  \fIwscount\fP      Number of elements in the vector
 .sp
-For \fBpcre2_dfa_match()\fP, a match context is needed only if you want to set
-up a callout function or specify the heap limit or the match or the recursion
-depth limits. The \fIlength\fP and \fIstartoffset\fP values are code units, not
-characters. The options are:
+The size of output vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
+data block is therefore not advisable when using this function.
+.P
+A match context is needed only if you want to set up a callout function or
+specify the heap limit or the match or the recursion depth limits. The
+\fIlength\fP and \fIstartoffset\fP values are code units, not characters. The
+options are:
 .sp
  PCRE2_ANCHORED          Match only at the first position
  PCRE2_COPY_MATCHED_SUBJECT
--- a/doc/pcre2_get_match_data_size.3
+++ b/doc/pcre2_get_match_data_size.3
@ -0,0 +1,27 @@
+.TH PCRE2_GET_MATCH_DATA_SIZE 3 "16 July 2019" "PCRE2 10.34"
+.SH NAME
+PCRE2 - Perl-compatible regular expressions (revised API)
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre2.h>
+.PP
+.nf
+.B PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *\fImatch_data\fP);
+.fi
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function returns the size, in bytes, of the match data block that is its
+argument.
+.P
+There is a complete description of the PCRE2 native API in the
+.\" HREF
+\fBpcre2api\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcre2posix\fP
+.\"
+page.
--- a/doc/pcre2_jit_compile.3
+++ b/doc/pcre2_jit_compile.3
@ -1,4 +1,4 @@
-.TH PCRE2_JIT_COMPILE 3 "06 March 2019" "PCRE2 10.33"
+.TH PCRE2_JIT_COMPILE 3 "29 July 2019" "PCRE2 10.34"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -29,11 +29,16 @@ bits:
  PCRE2_JIT_COMPLETE      compile code for full matching
  PCRE2_JIT_PARTIAL_SOFT  compile code for soft partial matching
  PCRE2_JIT_PARTIAL_HARD  compile code for hard partial matching
-  PCRE2_JIT_INVALID_UTF   compile code to handle invalid UTF
 .sp
+There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been
+superseded by the \fBpcre2_compile()\fP option PCRE2_MATCH_INVALID_UTF. The old
+option is deprecated and may be removed in the future.
+.P
 The yield of the function is 0 for success, or a negative error code otherwise.
 In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or
-if an unknown bit is set in \fIoptions\fP.
+if an unknown bit is set in \fIoptions\fP. The function can also return
+PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the
+compiler, even if it was because of a system security restriction.
 .P
 There is a complete description of the PCRE2 native API in the
 .\" HREF
--- a/doc/pcre2_jit_free_unused_memory.3
+++ b/doc/pcre2_jit_free_unused_memory.3
@ -17,7 +17,7 @@ This function frees unused JIT executable memory. The argument is a general
 context, for custom memory management, or NULL for standard memory management.
 JIT memory allocation retains some memory in order to improve future JIT
 compilation speed. In low memory conditions,
-\fBpcre2_jit_free_unused_memory()\fB can be used to cause this memory to be
+\fBpcre2_jit_free_unused_memory()\fP can be used to cause this memory to be
 freed.
 .P
 There is a complete description of the PCRE2 native API in the
--- a/doc/pcre2_jit_match.3
+++ b/doc/pcre2_jit_match.3
@ -1,4 +1,4 @@
-.TH PCRE2_JIT_MATCH 3 "03 November 2014" "PCRE2 10.0"
+.TH PCRE2_JIT_MATCH 3 "11 February 2020" "PCRE2 10.35"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -22,8 +22,10 @@ algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and
 it bypasses some of the sanity checks that \fBpcre2_match()\fP applies.
 Its arguments are exactly the same as for
 .\" HREF
-\fBpcre2_match()\fP.
+\fBpcre2_match()\fP,
 .\"
+except that the subject string must be specified with a length;
+PCRE2_ZERO_TERMINATED is not supported.
 .P
 The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
 PCRE2_NOTEMPTY_ATSTART, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Unsupported
--- a/doc/pcre2_jit_stack_create.3
+++ b/doc/pcre2_jit_stack_create.3
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
 \fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
 which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
 A maximum stack size of 512KiB to 1MiB should be more than enough for any
-pattern. For more details, see the
+pattern. If the stack couldn't be allocated or the values passed were not
+reasonable, NULL will be returned. For more details, see the
 .\" HREF
 \fBpcre2jit\fP
 .\"
--- a/doc/pcre2_maketables.3
+++ b/doc/pcre2_maketables.3
@ -7,7 +7,7 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .B #include <pcre2.h>
 .PP
 .SM
-.B const unsigned char *pcre2_maketables(pcre2_general_context *\fIgcontext\fP);
+.B const uint8_t *pcre2_maketables(pcre2_general_context *\fIgcontext\fP);
 .
 .SH DESCRIPTION
 .rs
--- a/doc/pcre2_maketables_free.3
+++ b/doc/pcre2_maketables_free.3
@ -0,0 +1,31 @@
+.TH PCRE2_MAKETABLES_FREE 3 "02 September 2019" "PCRE2 10.34"
+.SH NAME
+PCRE2 - Perl-compatible regular expressions (revised API)
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre2.h>
+.PP
+.nf
+.B void pcre2_maketables_free(pcre2_general_context *\fIgcontext\fP,
+.B "  const uint8_t *\fItables\fP);"
+.fi
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function discards a set of character tables that were created by a call
+to
+.\" HREF
+\fBpcre2_maketables()\fP.
+.\"
+.P
+The \fIgcontext\fP parameter should match what was used in that call to
+account for any custom allocators that might be in use; if it is NULL
+the system \fBfree()\fP is used.
+.P
+There is a complete description of the PCRE2 native API in the
+.\" HREF
+\fBpcre2api\fP
+.\"
+page.
--- a/doc/pcre2_match_data_create.3
+++ b/doc/pcre2_match_data_create.3
@ -1,4 +1,4 @@
-.TH PCRE2_MATCH_DATA_CREATE 3 "29 July 2015" "PCRE2 10.21"
+.TH PCRE2_MATCH_DATA_CREATE 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -18,8 +18,9 @@ This function creates a new match data block, which is used for holding the
 result of a match. The first argument specifies the number of pairs of offsets
 that are required. These form the "output vector" (ovector) within the match
 data block, and are used to identify the matched string and any captured
-substrings. There is always one pair of offsets; if \fBovecsize\fP is zero, it
-is treated as one.
+substrings when matching with \fBpcre2_match()\fP, or a number of different
+matches at the same point when used with \fBpcre2_dfa_match()\fP. There is
+always one pair of offsets; if \fBovecsize\fP is zero, it is treated as one.
 .P
 The second argument points to a general context, for custom memory management,
 or is NULL for system memory management. The result of the function is NULL if
--- a/doc/pcre2_match_data_create_from_pattern.3
+++ b/doc/pcre2_match_data_create_from_pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "29 July 2015" "PCRE2 10.21"
+.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -14,12 +14,15 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .SH DESCRIPTION
 .rs
 .sp
-This function creates a new match data block, which is used for holding the
-result of a match. The first argument points to a compiled pattern. The number
-of capturing parentheses within the pattern is used to compute the number of
-pairs of offsets that are required in the match data block. These form the
-"output vector" (ovector) within the match data block, and are used to identify
-the matched string and any captured substrings.
+This function creates a new match data block for holding the result of a match.
+The first argument points to a compiled pattern. The number of capturing
+parentheses within the pattern is used to compute the number of pairs of
+offsets that are required in the match data block. These form the "output
+vector" (ovector) within the match data block, and are used to identify the
+matched string and any captured substrings when matching with
+\fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the
+outut vector in a different way, you should use \fBpcre2_match_data_create()\fP
+instead of this function.
 .P
 The second argument points to a general context, for custom memory management,
 or is NULL to use the same memory allocator as was used for the compiled
--- a/doc/pcre2_serialize_decode.3
+++ b/doc/pcre2_serialize_decode.3
@ -36,7 +36,7 @@ the following negative error codes:
  PCRE2_ERROR_BADDATA   \fInumber_of_codes\fP is zero or less
  PCRE2_ERROR_BADMAGIC  mismatch of id bytes in \fIbytes\fP
  PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
  PCRE2_ERROR_NULL      \fIcodes\fP or \fIbytes\fP is NULL
 .sp
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
--- a/doc/pcre2_set_character_tables.3
+++ b/doc/pcre2_set_character_tables.3
@ -1,4 +1,4 @@
-.TH PCRE2_SET_CHARACTER_TABLES 3 "22 October 2014" "PCRE2 10.00"
+.TH PCRE2_SET_CHARACTER_TABLES 3 "20 March 2020" "PCRE2 10.35"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -8,16 +8,21 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .PP
 .nf
 .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP,
-.B "  const unsigned char *\fItables\fP);"
+.B "  const uint8_t *\fItables\fP);"
 .fi
 .
 .SH DESCRIPTION
 .rs
 .sp
 This function sets a pointer to custom character tables within a compile
-context. The second argument must be the result of a call to
-\fBpcre2_maketables()\fP or NULL to request the default tables. The result is
-always zero.
+context. The second argument must point to a set of PCRE2 character tables or
+be NULL to request the default tables. The result is always zero. Character
+tables can be created by calling \fBpcre2_maketables()\fP or by running the
+\fBpcre2_dftables\fP maintenance command in binary mode (see the
+.\" HREF
+\fBpcre2build\fP
+.\"
+documentation).
 .P
 There is a complete description of the PCRE2 native API in the
 .\" HREF
--- a/doc/pcre2_set_compile_extra_options.3
+++ b/doc/pcre2_set_compile_extra_options.3
@ -1,4 +1,4 @@
-.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "11 February 2019" "PCRE2 10.33"
+.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "31 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -18,12 +18,13 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
 housed in a compile context. It completely replaces all the bits. The extra
 options are:
 .sp
+  PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \eK in lookarounds
 .\" JOIN
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{df800} to \ex{dfff}
+  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{d800} to \ex{dfff}
                                         in UTF-8 and UTF-32 modes
 .\" JOIN
-  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \eu, \eU, and \ex
-                                         handling
+  PCRE2_EXTRA_ALT_BSUX                 Extended alternate \eu, \eU, and
+                                         \ex handling
 .\" JOIN
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as
                                         a literal following character
--- a/doc/pcre2_substitute.3
+++ b/doc/pcre2_substitute.3
@ -1,4 +1,4 @@
-.TH PCRE2_SUBSTITUTE 3 "04 April 2017" "PCRE2 10.30"
+.TH PCRE2_SUBSTITUTE 3 "22 January 2020" "PCRE2 10.35"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -36,8 +36,8 @@ Its arguments are:
  \fIoutlengthptr\fP  Points to the length of the output buffer
 .sp
 A match data block is needed only if you want to inspect the data from the
-match that is returned in that block. A match context is needed only if you
-want to:
+final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is
+set. A match context is needed only if you want to:
 .sp
  Set up a callout function
  Set a matching offset limit
@ -45,33 +45,57 @@ want to:
  Change the backtracking depth limit
  Set custom memory management in the match context
 .sp
-The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code
-units, not characters, as is the contents of the variable pointed at by
-\fIoutlengthptr\fP, which is updated to the actual length of the new string.
+The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code units,
+not characters, as is the contents of the variable pointed at by
+\fIoutlengthptr\fP. This variable must contain the length of the output buffer
+when the function is called. If the function is successful, the value is
+changed to the length of the new string, excluding the trailing zero that is
+automatically added.
+.P
 The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
 zero-terminated strings. The options are:
 .sp
  PCRE2_ANCHORED                     Match only at the first position
-  PCRE2_ENDANCHORED          Pattern can match only at end of subject
-  PCRE2_NOTBOL               Subject is not the beginning of a line
-  PCRE2_NOTEOL               Subject is not the end of a line
-  PCRE2_NOTEMPTY             An empty string is not a valid match
+  PCRE2_ENDANCHORED                  Match only at end of subject
 .\" JOIN
-  PCRE2_NOTEMPTY_ATSTART     An empty string at the start of the
-                              subject is not a valid match
+  PCRE2_NOTBOL                       Subject is not the beginning of a
+                                      line
+  PCRE2_NOTEOL                       Subject is not the end of a line
+.\" JOIN
+  PCRE2_NOTEMPTY                     An empty string is not a
+                                      valid match
+.\" JOIN
+  PCRE2_NOTEMPTY_ATSTART             An empty string at the start of
+                                      the subject is not a valid match
  PCRE2_NO_JIT                       Do not use JIT matching
 .\" JOIN
-  PCRE2_NO_UTF_CHECK         Do not check the subject or replacement
-                              for UTF validity (only relevant if
-                              PCRE2_UTF was set at compile time)
+  PCRE2_NO_UTF_CHECK                 Do not check for UTF validity in
+                                      the subject or replacement
+.\" JOIN
+                                      (only relevant if PCRE2_UTF was
+                                      set at compile time)
  PCRE2_SUBSTITUTE_EXTENDED          Do extended replacement processing
-  PCRE2_SUBSTITUTE_GLOBAL    Replace all occurrences in the subject
+.\" JOIN
+  PCRE2_SUBSTITUTE_GLOBAL            Replace all occurrences in the
+                                      subject
+  PCRE2_SUBSTITUTE_LITERAL           The replacement string is literal
+.\" JOIN
+  PCRE2_SUBSTITUTE_MATCHED           Use pre-existing match data for
+                                      first match
  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH   If overflow, compute needed length
+  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  Return only replacement string(s)
  PCRE2_SUBSTITUTE_UNKNOWN_UNSET     Treat unknown group as unset
  PCRE2_SUBSTITUTE_UNSET_EMPTY       Simple unset insert = empty string
 .sp
+If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
+PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
+.P
+If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
+contents must be the result of a call to \fBpcre2_match()\fP using the same
+pattern and subject.
+.P
 The function returns the number of substitutions, which may be zero if there
-were no matches. The result can be greater than one only when
+are no matches. The result may be greater than one only when
 PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
 is returned.
 .P
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
--- a/doc/pcre2build.3
+++ b/doc/pcre2build.3
@ -1,4 +1,4 @@
-.TH PCRE2BUILD 3 "03 March 2019" "PCRE2 10.33"
+.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .
@ -110,7 +110,7 @@ To build it without Unicode support, add
  --disable-unicode
 .sp
 to the \fBconfigure\fP command. This setting applies to all three libraries. It
-is not possible to build one library with Unicode support, and another without,
+is not possible to build one library with Unicode support and another without
 in the same configuration.
 .P
 Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
 UTF support allows the libraries to process character code points up to
 0x10ffff in the strings that they handle. Unicode support also gives access to
 the Unicode properties of characters, using pattern escapes such as \eP, \ep,
-and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
-supported. Details are given in the
+and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
+script names, and some bi-directional properties are supported. Details are
+given in the
 .\" HREF
 \fBpcre2pattern\fP
 .\"
@ -175,11 +176,11 @@ SELinux. This has no effect if JIT is not enabled. See the
 \fBpcre2jit\fP
 .\"
 documentation for a discussion of JIT usage. When JIT support is enabled,
-pcre2grep automatically makes use of it, unless you add
+\fBpcre2grep\fP automatically makes use of it, unless you add
 .sp
  --disable-pcre2grep-jit
 .sp
-to the "configure" command.
+to the \fBconfigure\fP command.
 .
 .
 .SH "NEWLINE RECOGNITION"
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
 \fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
 counting is done differently).
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The \fBpcre2_match()\fP function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 .\" HREF
 \fBpcre2api\fP
 .\"
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
 for --with-match-limit. You can set a lower default limit by adding, for
 example,
 .sp
-  --with-match-limit_depth=10000
+  --with-match-limit-depth=10000
 .sp
 to the \fBconfigure\fP command. This value can be overridden at run time. This
 depth limit indirectly limits the amount of heap memory that is used, but
@ -317,6 +317,7 @@ used for lookaround assertions, atomic groups, and recursion within patterns.
 The limit does not apply to JIT matching.
 .
 .
+.\" HTML <a name="createtables"></a>
 .SH "CREATING CHARACTER TABLES AT BUILD TIME"
 .rs
 .sp
@ -328,12 +329,33 @@ only. If you add
  --enable-rebuild-chartables
 .sp
 to the \fBconfigure\fP command, the distributed tables are no longer used.
-Instead, a program called \fBdftables\fP is compiled and run. This outputs the
-source for new set of tables, created in the default locale of your C run-time
-system. This method of replacing the tables does not work if you are cross
-compiling, because \fBdftables\fP is run on the local host. If you need to
-create alternative tables when cross compiling, you will have to do so "by
-hand".
+Instead, a program called \fBpcre2_dftables\fP is compiled and run. This
+outputs the source for new set of tables, created in the default locale of your
+C run-time system. This method of replacing the tables does not work if you are
+cross compiling, because \fBpcre2_dftables\fP needs to be run on the local
+host and therefore not compiled with the cross compiler.
+.P
+If you need to create alternative tables when cross compiling, you will have to
+do so "by hand". There may also be other reasons for creating tables manually.
+To cause \fBpcre2_dftables\fP to be built on the local host, run a normal
+compiling command, and then run the program with the output file as its
+argument, for example:
+.sp
+  cc src/pcre2_dftables.c -o pcre2_dftables
+  ./pcre2_dftables src/pcre2_chartables.c
+.sp
+This builds the tables in the default locale of the local host. If you want to
+specify a locale, you must use the -L option:
+.sp
+  LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
+.sp
+You can also specify -b (with or without -L). This causes the tables to be
+written in binary instead of as source code. A set of binary tables can be
+loaded into memory by an application and passed to \fBpcre2_compile()\fP in the
+same way as tables created by calling \fBpcre2_maketables()\fP. The tables are
+just a string of bytes, independent of hardware characteristics such as
+endianness. This means they can be bundled with an application that runs in
+different environments, to ensure consistent behaviour.
 .
 .
 .SH "USING EBCDIC CODE"
@ -417,7 +439,7 @@ default parameter values by adding, for example,
  --with-pcre2grep-bufsize=51200
  --with-pcre2grep-max-bufsize=2097152
 .sp
-to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override
+to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
 these values by using --buffer-size and --max-buffer-size on the command line.
 .
 .
@ -541,15 +563,16 @@ documentation.
 .sp
 The C99 standard defines formatting modifiers z and t for size_t and
 ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
-environments other than Microsoft Visual Studio when __STDC_VERSION__ is
-defined and has a value greater than or equal to 199901L (indicating C99).
+environments other than old versions of Microsoft Visual Studio when
+__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
+(indicating support for C99).
 However, there is at least one environment that claims to be C99 but does not
 support these modifiers. If
 .sp
  --disable-percent-zt
 .sp
-is specified, no use is made of the z or t modifiers. Instead or %td or %zu,
-%lu is used, with a cast for size_t values.
+is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
+a suitable format is used depending in the size of long for the platform.
 .
 .
 .SH "SUPPORT FOR FUZZERS"
@ -601,7 +624,7 @@ give a warning.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -610,6 +633,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 03 March 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2compat.3
+++ b/doc/pcre2compat.3
@ -1,33 +1,43 @@
-.TH PCRE2COMPAT 3 "12 February 2019" "PCRE2 10.33"
+.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
 .rs
 .sp
-This document describes the differences in the ways that PCRE2 and Perl handle
-regular expressions. The differences described here are with respect to Perl
-versions 5.26, but as both Perl and PCRE2 are continually changing, the
-information may sometimes be out of date.
+This document describes some of the differences in the ways that PCRE2 and Perl
+handle regular expressions. The differences described here are with respect to
+Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
+information may at times be out of date.
 .P
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
+1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
+behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
+next character unless it is the start of a newline sequence. This means that,
+if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
+(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
+EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
+indicator.
+.P
+2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
 have are given in the
 .\" HREF
 \fBpcre2unicode\fP
 .\"
 page.
 .P
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
+3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
 they do not mean what you might think. For example, (?!a){3} does not assert
 that the next three characters are not "a". It just asserts that the next
 character is not "a" three times (in principle; PCRE2 optimizes this to run the
 assertion just once). Perl allows some repeat quantifiers on other assertions,
-for example, \eb* (but not \eb{3}), but these do not seem to have any use.
+for example, \eb* , but these do not seem to have any use. PCRE2 does not allow
+any kind of quantifier on non-lookaround assertions.
 .P
-3. Capture groups that occur inside negative lookaround assertions are counted,
+4. Capture groups that occur inside negative lookaround assertions are counted,
 but their entries in the offsets vector are set only when a negative assertion
 is a condition that has a matching branch (that is, the condition is false).
+Perl may set such capture groups in other circumstances.
 .P
-4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
+5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
 \eU, and \eN when followed by a character name. \eN on its own, matching a
 non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
 supported. The escapes that modify the case of following letters are
@ -37,23 +47,27 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
 PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
 interprets them.
 .P
-5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
+6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
 built with Unicode support (the default). The properties that can be tested
 with \ep and \eP are limited to the general category properties such as Lu and
-Nd, script names such as Greek or Han, and the derived properties Any and L&.
-PCRE2 does support the Cs (surrogate) property, which Perl does not; the Perl
-documentation says "Because Perl hides the need for the user to understand the
-internal representation of Unicode characters, there is no need to implement
-the somewhat messy concept of surrogates."
+Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
+derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
+(surrogate) property, but in PCRE2 its use is limited. See the
+.\" HREF
+\fBpcre2pattern\fP
+.\"
+documentation for details. The long synonyms for property names that Perl
+supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
+to prefix any of these properties with "Is".
 .P
-6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
+7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
 in between are treated as literals. However, this is slightly different from
 Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
-they cause variable interpolation (but of course PCRE2 does not have
-variables). Also, Perl does "double-quotish backslash interpolation" on any
-backslashes between \eQ and \eE which, its documentation says, "may lead to
-confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
-other character. Note the following examples:
+they cause variable interpolation (PCRE2 does not have variables). Also, Perl
+does "double-quotish backslash interpolation" on any backslashes between \eQ
+and \eE which, its documentation says, "may lead to confusing results". PCRE2
+treats a backslash between \eQ and \eE just like any other character. Note the
+following examples:
 .sp
    Pattern            PCRE2 matches     Perl matches
 .sp
@ -65,9 +79,10 @@ other character. Note the following examples:
    \eQA\eB\eE            A\eB               A\eB
    \eQ\e\eE              \e                 \e\eE
 .sp
-The \eQ...\eE sequence is recognized both inside and outside character classes.
+The \eQ...\eE sequence is recognized both inside and outside character classes
+by both PCRE2 and Perl.
 .P
-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
+8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
 constructions. However, PCRE2 does have a "callout" feature, which allows an
 external function to be called during pattern matching. See the
 .\" HREF
@ -75,27 +90,24 @@ external function to be called during pattern matching. See the
 .\"
 documentation for details.
 .P
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up
+9. Subroutine calls (whether recursive or not) were treated as atomic groups up
 to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
 into subroutine calls is now supported, as in Perl.
 .P
-9. If any of the backtracking control verbs are used in a group that is called
-as a subroutine (whether or not recursively), their effect is confined to that
-group; it does not extend to the surrounding pattern. This is not always the
-case in Perl. In particular, if (*THEN) is present in a group that is called as
-a subroutine, its action is limited to that group, even if the group does not
-contain any | characters. Note that such groups are processed as anchored
-at the point where they are tested.
+10. In PCRE2, if any of the backtracking control verbs are used in a group that
+is called as a subroutine (whether or not recursively), their effect is
+confined to that group; it does not extend to the surrounding pattern. This is
+not always the case in Perl. In particular, if (*THEN) is present in a group
+that is called as a subroutine, its action is limited to that group, even if
+the group does not contain any | characters. Note that such groups are
+processed as anchored at the point where they are tested.
 .P
-10. If a pattern contains more than one backtracking control verb, the first
+11. If a pattern contains more than one backtracking control verb, the first
 one that is backtracked onto acts. For example, in the pattern
 A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
 triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
 same as PCRE2, but there are cases where it differs.
 .P
-11. Most backtracking verbs in assertions have their normal actions. They are
-not confined to the assertion.
-.P
 12. There are some differences that are concerned with the settings of captured
 strings when part of a pattern is repeated. For example, matching "aba" against
 the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
@ -104,7 +116,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
 13. PCRE2's handling of duplicate capture group numbers and names is not as
 general as Perl's. This is a consequence of the fact the PCRE2 works internally
 just with numbers, using an external table to translate between numbers and
-names. In particular, a pattern such as (?|(?<a>A)|(?<b>B), where the two
+names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
 capture groups have the same number but different names, is not supported, and
 causes an error at compile time. If it were allowed, it would not be possible
 to distinguish which group matched, because both names map to capture group
@ -124,17 +136,24 @@ certainly user mistakes.
 16. In PCRE2, the upper/lower case character properties Lu and Ll are not
 affected when case-independent matching is specified. For example, \ep{Lu}
 always matches an upper case letter. I think Perl has changed in this respect;
-in the release at the time of writing (5.24), \ep{Lu} and \ep{Ll} match all
+in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
 letters, regardless of case, when case independence is specified.
 .P
-17. PCRE2 provides some extensions to the Perl regular expression facilities.
-Perl 5.10 includes new features that are not in earlier versions of Perl, some
+17. From release 5.32.0, Perl locks out the use of \eK in lookaround
+assertions. From release 10.38 PCRE2 does the same by default. However, there
+is an option for re-enabling the previous behaviour. When this option is set,
+\eK is acted on when it occurs in positive assertions, but is ignored in
+negative assertions.
+.P
+18. PCRE2 provides some extensions to the Perl regular expression facilities.
+Perl 5.10 included new features that were not in earlier versions of Perl, some
 of which (such as named parentheses) were in PCRE2 for some time before. This
-list is with respect to Perl 5.26:
+list is with respect to Perl 5.34:
 .sp
 (a) Although lookbehind assertions in PCRE2 must match fixed length strings,
-each alternative branch of a lookbehind assertion can match a different length
-of string. Perl requires them all to have the same length.
+each alternative toplevel branch of a lookbehind assertion can match a
+different length of string. Perl used to require them all to have the same
+length, but the latest version has some variable length support.
 .sp
 (b) From PCRE2 10.23, backreferences to groups of fixed length are supported
 in lookbehinds, provided that there is no possibility of referencing a
@ -168,14 +187,18 @@ variable interpolation, but not general hooks on every match.
 different way and is not Perl-compatible.
 .sp
 (l) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
-the start of a pattern that set overall options that cannot be changed within
+the start of a pattern. These set overall options that cannot be changed within
 the pattern.
+.sp
+(m) PCRE2 supports non-atomic positive lookaround assertions. This is an
+extension to the lookaround facilities. The default, Perl-compatible
+lookarounds are atomic.
 .P
-18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
+19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
 modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
 rules. This separation cannot be represented with PCRE2_UCP.
 .P
-19. Perl has different limits than PCRE2. See the
+20. Perl has different limits than PCRE2. See the
 .\" HREF
 \fBpcre2limit\fP
 .\"
@ -190,7 +213,7 @@ fall into any stack-overflow limit. PCRE2 made a similar change at release
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -199,6 +222,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 12 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 08 December 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2convert.3
+++ b/doc/pcre2convert.3
@ -116,8 +116,8 @@ permitted to match separator characters, but the double-star (**) feature
 (which does match separators) is supported.
 .P
 PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
-match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
-double-star feature disabled. These options may be given together.
+match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
+the double-star feature disabled. These options may be given together.
 .
 .
 .SH "CONVERTING POSIX PATTERNS"
--- a/doc/pcre2demo.3
+++ b/doc/pcre2demo.3
@ -104,12 +104,11 @@ uint32_t newline;

 PCRE2_SIZE erroroffset;
 PCRE2_SIZE *ovector;
+PCRE2_SIZE subject_length;

-size_t subject_length;
 pcre2_match_data *match_data;


-
 /**************************************************************************
 * First, sort out the command line. There is only one possible option at  *
 * the moment, "-g" to request repeated matching to find all occurrences,  *
@ -138,12 +137,14 @@ if (argc - i != 2)
  return 1;
  }

-/* As pattern and subject are char arguments, they can be straightforwardly
-cast to PCRE2_SPTR as we are working in 8-bit code units. */
+/* Pattern and subject are char arguments, so they can be straightforwardly
+cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
+length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
+defined to be size_t. */

 pattern = (PCRE2_SPTR)argv[i];
 subject = (PCRE2_SPTR)argv[i+1];
-subject_length = strlen((char *)subject);
+subject_length = (PCRE2_SIZE)strlen((char *)subject);


 /*************************************************************************
@ -172,17 +173,22 @@ if (re == NULL)


 /*************************************************************************
-* If the compilation succeeded, we call PCRE again, in order to do a     *
+* If the compilation succeeded, we call PCRE2 again, in order to do a    *
 * pattern match against the subject string. This does just ONE match. If *
 * further matching is needed, it will be done below. Before running the  *
-* match we must set up a match_data block for holding the result.        *
+* match we must set up a match_data block for holding the result. Using  *
+* pcre2_match_data_create_from_pattern() ensures that the block is       *
+* exactly the right size for the number of capturing parentheses in the  *
+* pattern. If you need to know the actual size of a match_data block as  *
+* a number of bytes, you can find it like this:                          *
+*                                                                        *
+* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data);    *
 *************************************************************************/

-/* Using this function ensures that the block is exactly the right size for
-the number of capturing parentheses in the pattern. */
-
 match_data = pcre2_match_data_create_from_pattern(re, NULL);

+/* Now run the match. */
+
 rc = pcre2_match(
  re,                   /* the compiled pattern */
  subject,              /* the subject string */
@ -209,8 +215,8 @@ if (rc < 0)
  return 1;
  }

-/* Match succeded. Get a pointer to the output vector, where string offsets are
-stored. */
+/* Match succeeded. Get a pointer to the output vector, where string offsets
+are stored. */

 ovector = pcre2_get_ovector_pointer(match_data);
 printf("Match succeeded at offset %d\en", (int)ovector[0]);
@ -228,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
 if (rc == 0)
  printf("ovector was not big enough for all the captured substrings\en");

-/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
-to set the start of a match later than its end. In this demonstration program,
-we just detect this case and give up. */
+/* Since release 10.38 PCRE2 has locked out the use of \eK in lookaround
+assertions. However, there is an option to re-enable the old behaviour. If that
+is set, it is possible to run patterns such as /(?=.\eK)/ that use \eK in an
+assertion to set the start of a match later than its end. In this demonstration
+program, we show how to detect this case, but it shouldn't arise because the
+option is never set. */

 if (ovector[0] > ovector[1])
  {
@ -249,7 +258,7 @@ application you might want to do things other than print them. */
 for (i = 0; i < rc; i++)
  {
  PCRE2_SPTR substring_start = subject + ovector[2*i];
-  size_t substring_length = ovector[2*i+1] - ovector[2*i];
+  PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
  printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
  }

@ -447,7 +456,7 @@ for (;;)
    return 1;
    }

-  /* Match succeded */
+  /* Match succeeded */

  printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);

--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "24 November 2018" "PCRE2 10.33"
+.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
 .SH NAME
 pcre2grep - a grep with Perl-compatible regular expressions.
 .SH SYNOPSIS
@ -43,13 +43,15 @@ For example:
 .sp
  pcre2grep some-pattern file1 - file3
 .sp
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how \fBpcre2grep\fP behaves. In
-particular, the \fB-M\fP option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-\fB-N\fP (\fB--newline\fP) option.
+However, there are options that can change how \fBpcre2grep\fP behaves. For
+example, the \fB-M\fP option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the \fB-N\fP
+(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
+not file names are shown, and the \fB-Z\fP option changes the file name
+terminator to a zero byte.
 .P
 The amount of memory used for buffering files that are being scanned is
 controlled by parameters that can be set by the \fB--buffer-size\fP and
@ -79,8 +81,8 @@ matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
 (either shown literally, or as an offset), scanning resumes immediately
 following the match, so that further matches on the same line can be found. If
 there are multiple patterns, they are all tried on the remainder of the line,
-but patterns that follow the one that matched are not tried on the earlier part
-of the line.
+but patterns that follow the one that matched are not tried on the earlier
+matched part of the line.
 .P
 This behaviour means that the order in which multiple patterns are specified
 can affect the output when one of the above options is used. This is no longer
@ -115,11 +117,10 @@ ignored.
 .rs
 .sp
 By default, a file that contains a binary zero byte within the first 1024 bytes
-is identified as a binary file, and is processed specially. (GNU grep
-identifies binary files in this manner.) However, if the newline type is
-specified as "nul", that is, the line terminator is a binary zero, the test for
-a binary file is not applied. See the \fB--binary-files\fP option for a means
-of changing the way binary files are handled.
+is identified as a binary file, and is processed specially. However, if the
+newline type is specified as NUL, that is, the line terminator is a binary
+zero, the test for a binary file is not applied. See the \fB--binary-files\fP
+option for a means of changing the way binary files are handled.
 .
 .
 .SH "BINARY ZEROS IN PATTERNS"
@ -150,22 +151,30 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of \fInumber\fP
-is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
+context lines (the \fB-Z\fP option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
+\fB-A\fP is ignored.
 .TP
 \fB-a\fP, \fB--text\fP
 Treat binary files as text. This is equivalent to
 \fB--binary-files\fP=\fItext\fP.
 .TP
+\fB--allow-lookaround-bsk\fP
+PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl.
+This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
+option, which enables this somewhat dangerous usage.
+.TP
 \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
 Output up to \fInumber\fP lines of context before each matching line. Fewer
 lines are output if the previous match or the start of the file is within
 \fInumber\fP lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of \fInumber\fP is expected to be relatively small. When
+instead of a colon for the context lines (the \fB-Z\fP option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of \fInumber\fP is expected to be relatively small. When
 \fB-c\fP is used, \fB-B\fP is ignored.
 .TP
 \fB--binary-files=\fP\fIword\fP
@ -352,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
 .TP
 \fB-H\fP, \fB--with-filename\fP
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the \fB-M\fP option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the \fB-M\fP option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
 .TP
 \fB-h\fP, \fB--no-filename\fP
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
+Suppress the output file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The \fB-Z\fP option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
 .TP
 \fB--heap-limit\fP=\fInumber\fP
 See \fB--match-limit\fP below.
@ -383,8 +394,8 @@ Ignore upper/lower case distinctions during comparisons.
 .TP
 \fB--include\fP=\fIpattern\fP
 If any \fB--include\fP patterns are specified, the only files that are
-processed are those that match one of the patterns (and do not match an
-\fB--exclude\fP pattern). This option does not affect directories, but it
+processed are those whose names match one of the patterns and do not match an
+\fB--exclude\fP pattern. This option does not affect directories, but it
 applies to all files, whether listed on the command line, obtained from
 \fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
 expression, and is matched against the final component of the file name, not
@ -401,8 +412,8 @@ may be given any number of times; all the files are read.
 .TP
 \fB--include-dir\fP=\fIpattern\fP
 If any \fB--include-dir\fP patterns are specified, the only directories that
-are processed are those that match one of the patterns (and do not match an
-\fB--exclude-dir\fP pattern). This applies to all directories, whether listed
+are processed are those whose names match one of the patterns and do not match
+an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
 on the command line, obtained from \fB--file-list\fP, or by scanning a parent
 directory. The pattern is a PCRE2 regular expression, and is matched against
 the final component of the directory name, not the entire path. The \fB-F\fP,
@ -413,18 +424,21 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
 \fB-L\fP, \fB--files-without-match\fP
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous \fB-H\fP,
-\fB-h\fP, or \fB-l\fP options.
+output once, on a separate line by default, but if the \fB-Z\fP option is set, 
+they are separated by zero bytes instead of newlines. This option overrides any
+previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
 .TP
 \fB-l\fP, \fB--files-with-matches\fP
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the \fB-c\fP (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-\fB-c\fP is a way of suppressing the listing of files with no matches. This
-opeion overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options.
+a separate line, but if the \fB-Z\fP option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the \fB-c\fP (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with \fB-c\fP is a way of suppressing the listing of files with no matches that
+occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
+\fB-h\fP, or \fB-L\fP options.
 .TP
 \fB--label\fP=\fIname\fP
 This option supplies a name to be used for the standard input when file names
@ -435,8 +449,8 @@ short form for this option.
 When this option is given, non-compressed input is read and processed line by
 line, and the output is flushed after each write. By default, input is read in
 large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
-terminal (which is currently possible only in Unix-like environments or
-Windows). Output to terminal is normally automatically flushed by the operating
+terminal, which is currently possible only in Unix-like environments or
+Windows. Output to terminal is normally automatically flushed by the operating
 system. This option can be useful when the input or output is attached to a
 pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
 However, its use will affect performance, and the \fB-M\fP (multiline) option
@ -459,40 +473,6 @@ the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
 locale is specified, the PCRE2 library's default (usually the "C" locale) is
 used. There is no short form for this option.
 .TP
-\fB--match-limit\fP=\fInumber\fP
-Processing some regular expression patterns may take a very long time to search
-for all possible matching strings. Others may require a very large amount of
-memory. There are three options that set resource limits for matching.
-.sp
-The \fB--match-limit\fP option provides a means of limiting computing resource
-usage when processing patterns that are not going to match, but which have a
-very large number of possibilities in their search trees. The classic example
-is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
-counter that is incremented each time around its main processing loop. If the
-value set by \fB--match-limit\fP is reached, an error occurs.
-.sp
-The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
-.sp
-The \fB--depth-limit\fP option limits the depth of nested backtracking points,
-which indirectly limits the amount of memory that is used. The amount of memory
-needed for each backtracking point depends on the number of capturing
-parentheses in the pattern, so the amount of memory that is used before this
-limit acts varies from pattern to pattern. This limit is of use only if it is
-set smaller than \fB--match-limit\fP.
-.sp
-There are no short forms for these options. The default limits can be set
-when the PCRE2 library is compiled; if they are not specified, the defaults
-are very large and so effectively unlimited.
-.TP
-\fB--max-buffer-size=\fInumber\fP
-This limits the expansion of the processing buffer, whose initial size can be
-set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
-smaller than the starting buffer size.
-.TP
 \fB-M\fP, \fB--multiline\fP
 Allow patterns to match more than one line. When this option is set, the PCRE2
 library is called in "multiline" mode. This allows a matched string to extend
@ -520,27 +500,74 @@ well as possibly handling a two-character newline sequence.
 There is a limit to the number of lines that can be matched, imposed by the way
 that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
 large processing buffer, this should not be a problem, but the \fB-M\fP option
-does not work when input is read line by line (see \fP--line-buffered\fP.)
+does not work when input is read line by line (see \fB--line-buffered\fP.)
+.TP
+\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
+Stop processing after finding \fInumber\fP matching lines, or non-matching
+lines if \fB-v\fP is also set. Any trailing context lines are output after the
+final match. In multiline mode, each multiline match counts as just one line
+for this purpose. If this limit is reached when reading the standard input from
+a regular file, the file is left positioned just after the last matching line.
+If \fB-c\fP is also set, the count that is output is never greater than
+\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
+\fB-q\fP, or when just checking for a match in a binary file.
+.TP
+\fB--match-limit\fP=\fInumber\fP
+Processing some regular expression patterns may take a very long time to search
+for all possible matching strings. Others may require a very large amount of
+memory. There are three options that set resource limits for matching.
+.sp
+The \fB--match-limit\fP option provides a means of limiting computing resource
+usage when processing patterns that are not going to match, but which have a
+very large number of possibilities in their search trees. The classic example
+is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
+counter that is incremented each time around its main processing loop. If the
+value set by \fB--match-limit\fP is reached, an error occurs.
+.sp
+The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
+1024 bytes), the maximum amount of heap memory that may be used for matching.
+.sp
+The \fB--depth-limit\fP option limits the depth of nested backtracking points,
+which indirectly limits the amount of memory that is used. The amount of memory
+needed for each backtracking point depends on the number of capturing
+parentheses in the pattern, so the amount of memory that is used before this
+limit acts varies from pattern to pattern. This limit is of use only if it is
+set smaller than \fB--match-limit\fP.
+.sp
+There are no short forms for these options. The default limits can be set
+when the PCRE2 library is compiled; if they are not specified, the defaults
+are very large and so effectively unlimited.
+.TP
+\fB--max-buffer-size\fP=\fInumber\fP
+This limits the expansion of the processing buffer, whose initial size can be
+set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
+smaller than the starting buffer size.
 .TP
 \fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
-The PCRE2 library supports five different conventions for indicating
-the ends of lines. They are the single-character sequences CR (carriage return)
-and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
-which recognizes any of the preceding three types, and an "any" convention, in
-which any Unicode line ending sequence is assumed to end a line. The Unicode
-sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
-(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
-PS (paragraph separator, U+2029).
+Six different conventions for indicating the ends of lines in scanned files are
+supported. For example:
+.sp
+  pcre2grep -N CRLF 'some pattern' <file>
+.sp
+The newline type may be specified in upper, lower, or mixed case. If the
+newline type is NUL, lines are separated by binary zero characters. The other
+types are the single-character sequences CR (carriage return) and LF
+(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
+recognizes any of the preceding three types, and an "any" type, for which any
+Unicode line ending sequence is assumed to end a line. The Unicode sequences
+are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
+U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+(paragraph separator, U+2029).
 .sp
 When the PCRE2 library is built, a default line-ending sequence is specified.
 This is normally the standard sequence for the operating system. Unless
 otherwise specified by this option, \fBpcre2grep\fP uses the library's default.
-The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
-makes it possible to use \fBpcre2grep\fP to scan files that have come from
-other environments without having to modify their line endings. If the data
-that is being scanned does not agree with the convention set by this option,
-\fBpcre2grep\fP may behave in strange ways. Note that this option does not
-apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
+.sp
+This option makes it possible to use \fBpcre2grep\fP to scan files that have
+come from other environments without having to modify their line endings. If
+the data that is being scanned does not agree with the convention set by this
+option, \fBpcre2grep\fP may behave in strange ways. Note that this option does
+not apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
 \fB--include-from\fP options, which are expected to use the operating system's
 standard newline sequence.
 .TP
@ -559,25 +586,36 @@ use of JIT at run time. It is provided for testing and working round problems.
 It should never be needed in normal use.
 .TP
 \fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
-When there is a match, instead of outputting the whole line that matched,
-output just the given text. This option is mutually exclusive with
-\fB--only-matching\fP, \fB--file-offsets\fP, and \fB--line-offsets\fP. Escape
-sequences starting with a dollar character may be used to insert the contents
-of the matched part of the line and/or captured substrings into the text.
+When there is a match, instead of outputting the line that matched, output just
+the text specified in this option, followed by an operating-system standard
+newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
+and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
+this option, which is mutually exclusive with \fB--only-matching\fP,
+\fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
+\fB--only-matching\fP, if there is more than one match in a line, each of them
+causes a line of output.
 .sp
-$<digits> or ${<digits>} is replaced by the captured
-substring of the given decimal number; zero substitutes the whole match. If
-the number is greater than the number of capturing substrings, or if the
-capture is unset, the replacement is empty.
+Escape sequences starting with a dollar character may be used to insert the
+contents of the matched part of the line and/or captured substrings into the
+text.
+.sp
+$<digits> or ${<digits>} is replaced by the captured substring of the given
+decimal number; zero substitutes the whole match. If the number is greater than
+the number of capturing substrings, or if the capture is unset, the replacement
+is empty.
 .sp
 $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
 newline; $r by carriage return; $t by tab; $v by vertical tab.
 .sp
-$o<digits> is replaced by the character represented by the given octal
-number; up to three digits are processed.
+$o<digits> or $o{<digits>} is replaced by the character whose code point is the
+given octal number. In the first form, up to three octal digits are processed.
+When more digits are needed in Unicode mode to specify a wide character, the
+second form must be used.
 .sp
-$x<digits> is replaced by the character represented by the given hexadecimal
-number; up to two digits are processed.
+$x<digits> or $x{<digits>} is replaced by the character represented by the
+given hexadecimal number. In the first form, up to two hexadecimal digits are
+processed. When more digits are needed in Unicode mode to specify a wide
+character, the second form must be used.
 .sp
 Any other character is substituted by itself. In particular, $$ is replaced by
 a single dollar.
@ -596,19 +634,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP,
 .TP
 \fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
 Show only the part of the line that matched the capturing parentheses of the
-given number. Up to 32 capturing parentheses are supported, and -o0 is
-equivalent to \fB-o\fP without a number. Because these options can be given
-without an argument (see above), if an argument is present, it must be given in
-the same shell item, for example, -o3 or --only-matching=2. The comments given
-for the non-argument case above also apply to this option. If the specified
-capturing parentheses do not exist in the pattern, or were not set in the
-match, nothing is output unless the file name or line number are being output.
+given number. Up to 50 capturing parentheses are supported by default. This
+limit can be changed via the \fB--om-capture\fP option. A pattern may contain
+any number of capturing parentheses, but only those whose number is within the
+limit can be accessed by \fB-o\fP. An error occurs if the number specified by
+\fB-o\fP is greater than the limit.
+.sp
+-o0 is the same as \fB-o\fP without a number. Because these options can be
+given without an argument (see above), if an argument is present, it must be
+given in the same shell item, for example, -o3 or --only-matching=2. The
+comments given for the non-argument case above also apply to this option. If
+the specified capturing parentheses do not exist in the pattern, or were not
+set in the match, nothing is output unless the file name or line number are
+being output.
 .sp
 If this option is given multiple times, multiple substrings are output for each
 match, in the order the options are given, and all on one line. For example,
 -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
 then 3 again to be output. By default, there is no separator (but see the next
-option).
+but one option).
+.TP
+\fB--om-capture\fP=\fInumber\fP
+Set the number of capturing parentheses that can be accessed by \fB-o\fP. The
+default is 50.
 .TP
 \fB--om-separator\fP=\fItext\fP
 Specify a separating string for multiple occurrences of \fB-o\fP. The default
@ -626,7 +674,8 @@ immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
 option to "recurse".
 .TP
 \fB--recursion-limit\fP=\fInumber\fP
-See \fB--match-limit\fP above.
+This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
+above for details.
 .TP
 \fB-s\fP, \fB--no-messages\fP
 Suppress error messages about non-existent or unreadable files. Such files are
@ -644,11 +693,24 @@ is listed. If file names are being output, the grand total is preceded by
 ignored when used with \fB-L\fP (list files without matches), because the grand
 total would always be zero.
 .TP
-\fB-u\fP, \fB--utf-8\fP
+\fB-u\fP, \fB--utf\fP
 Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
 with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
-\fB--include\fP options) and all subject lines that are scanned must be valid
-strings of UTF-8 characters.
+\fB--include\fP options) and all lines that are scanned must be valid strings
+of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
+occurs.
+.TP
+\fB-U\fP, \fB--utf-allow-invalid\fP
+As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
+unit sequences. These can never form part of any pattern match. Patterns
+themselves, however, must still be valid UTF-8 strings. This facility allows
+valid UTF-8 strings to be sought within arbitrary byte sequences in executable
+or other binary files. For more details about matching in non-valid UTF-8
+strings, see the
+.\" HREF
+\fBpcre2unicode\fP(3)
+.\"
+documentation.
 .TP
 \fB-V\fP, \fB--version\fP
 Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
@ -657,7 +719,9 @@ ignored.
 .TP
 \fB-v\fP, \fB--invert-match\fP
 Invert the sense of the match, so that lines which do \fInot\fP match any of
-the patterns are the ones that are found.
+the patterns are the ones that are found. When this option is set, options such
+as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
+that are to be output, are ignored.
 .TP
 \fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
 Force the patterns only to match "words". That is, there must be a word
@ -674,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
 pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the \fB--include\fP or \fB--exclude\fP options.
+.TP
+\fB-Z\fP, \fB--null\fP
+Terminate files names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
 .
 .
 .SH "ENVIRONMENT VARIABLES"
@ -689,16 +759,25 @@ by the \fB--locale\fP option. If no locale is set, the PCRE2 library's default
 .rs
 .sp
 The \fB-N\fP (\fB--newline\fP) option allows \fBpcre2grep\fP to scan files with
-different newline conventions from the default. Any parts of the input files
-that are written to the standard output are copied identically, with whatever
-newline sequences they have in the input. However, the setting of this option
-affects only the way scanned files are processed. It does not affect the
-interpretation of files specified by the \fB-f\fP, \fB--file-list\fP,
-\fB--exclude-from\fP, or \fB--include-from\fP options, nor does it affect the
-way in which \fBpcre2grep\fP writes informational messages to the standard
-error and output streams. For these it uses the string "\en" to indicate
-newlines, relying on the C I/O library to convert this to an appropriate
-sequence.
+newline conventions that differ from the default. This option affects only the
+way scanned files are processed. It does not affect the interpretation of files
+specified by the \fB-f\fP, \fB--file-list\fP, \fB--exclude-from\fP, or
+\fB--include-from\fP options.
+.P
+Any parts of the scanned input files that are written to the standard output
+are copied with whatever newline sequences they have in the input. However, if
+the final line of a file is output, and it does not end with a newline
+sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
+or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
+single NL is used.
+.P
+The newline setting does not affect the way in which \fBpcre2grep\fP writes
+newlines in informational messages to the standard output and error streams.
+Under Windows, the standard output is set to be binary, so that "\er\en" at the
+ends of output lines that are copied from the input is not converted to
+"\er\er\en" by the C I/O library. This means that any messages written to the
+standard output must end with "\er\en". For all other operating systems, and
+for all messages to the standard error stream, "\en" is used.
 .
 .
 .SH "OPTIONS COMPATIBILITY"
@ -711,9 +790,9 @@ as in the GNU \fBgrep\fP program. Any long option of the form
 \fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
 \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
 \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
-\fB--output\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to
-\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
-capturing parentheses number.
+\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
+options are specific to \fBpcre2grep\fP, as is the use of the
+\fB--only-matching\fP option with a capturing parentheses number.
 .P
 Although most of the common options work the same way, a few are different in
 \fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
@ -775,12 +854,36 @@ documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
 only callouts with string arguments are useful.
 .
 .
+.SS "Echoing a specific string"
+.rs
+.sp
+Starting the callout string with a pipe character invokes an echoing facility
+that avoids calling an external program or script. This facility is always
+available, provided that callouts were not completely disabled when
+\fBpcre2grep\fP was built. The rest of the callout string is processed as a
+zero-terminated string, which means it should not contain any internal binary
+zeros. It is written to the output, having first been passed through the same
+escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
+above). However, $0 cannot be used to insert a matched substring because the
+match is still in progress. Instead, the single character '0' is inserted. Any
+syntax errors in the string (for example, a dollar not followed by another
+character) causes the callout to be ignored. No terminator is added to the
+output string, so if you want a newline, you must include it explicitly using
+the escape $n. For example:
+.sp
+  pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
+.sp
+Matching continues normally after the string is output. If you want to see only
+the callout output but not any output from an actual match, you should end the
+pattern with (*FAIL).
+.
+.
 .SS "Calling external programs or scripts"
 .rs
 .sp
 This facility can be independently disabled when \fBpcre2grep\fP is built. It
 is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
-where \fBlib$spawn()\fP is used, and for any other Unix-like environment where
+where \fBlib$spawn()\fP is used, and for any Unix-like environment where
 \fBfork()\fP and \fBexecv()\fP are available.
 .P
 If the callout string does not start with a pipe (vertical bar) character, it
@ -791,13 +894,11 @@ arguments:
  executable_name|arg1|arg2|...
 .sp
 Any substring (including the executable name) may contain escape sequences
-started by a dollar character: $<digits> or ${<digits>} is replaced by the
-captured substring of the given decimal number, which must be greater than
-zero. If the number is greater than the number of capturing substrings, or if
-the capture is unset, the replacement is empty.
-.P
-Any other character is substituted by itself. In particular, $$ is replaced by
-a single dollar and $| is replaced by a pipe character. Here is an example:
+started by a dollar character. These are the same as for the \fB--output\fP
+(\fB-O\fP) option documented above, except that $0 cannot insert the matched
+string because the match is still in progress. Instead, the character '0'
+is inserted. If you need a literal dollar or pipe character in any
+substring, use $$ or $| respectively. Here is an example:
 .sp
  echo -e "abcde\en12345" | pcre2grep \e
    '(?x)(.)(..(.))
@ -810,28 +911,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
    Arg1: [1] [234] [4] Arg2: |1| ()
    12345
 .sp
-The parameters for the system call that is used to run the
-program or script are zero-terminated strings. This means that binary zero
-characters in the callout argument will cause premature termination of their
-substrings, and therefore should not be present. Any syntax errors in the
-string (for example, a dollar not followed by another character) cause the
-callout to be ignored. If running the program fails for any reason (including
-the non-existence of the executable), a local matching failure occurs and the
-matcher backtracks in the normal way.
-.
-.
-.SS "Echoing a specific string"
-.rs
-.sp
-This facility is always available, provided that callouts were not completely
-disabled when \fBpcre2grep\fP was built. If the callout string starts with a
-pipe (vertical bar) character, the rest of the string is written to the output,
-having been passed through the same escape processing as text from the --output
-option. This provides a simple echoing facility that avoids calling an external
-program or script. No terminator is added to the string, so if you want a
-newline, you must include it explicitly. Matching continues normally after the
-string is output. If you want to see only the callout output but not any output
-from an actual match, you should end the relevant pattern with (*FAIL).
+The parameters for the system call that is used to run the program or script
+are zero-terminated strings. This means that binary zero characters in the
+callout argument will cause premature termination of their substrings, and
+therefore should not be present. Any syntax errors in the string (for example,
+a dollar not followed by another character) causes the callout to be ignored.
+If running the program fails for any reason (including the non-existence of the
+executable), a local matching failure occurs and the matcher backtracks in the
+normal way.
 .
 .
 .SH "MATCHING ERRORS"
@ -867,7 +954,8 @@ because VMS does not distinguish between exit(0) and exit(1).
 .SH "SEE ALSO"
 .rs
 .sp
-\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3).
+\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
+\fBpcre2unicode\fP(3).
 .
 .
 .SH AUTHOR
@ -875,7 +963,7 @@ because VMS does not distinguish between exit(0) and exit(1).
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -884,6 +972,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 24 November 2018
-Copyright (c) 1997-2018 University of Cambridge.
+Last updated: 30 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt
@ -12,11 +12,11 @@ SYNOPSIS
 DESCRIPTION

       pcre2grep  searches  files  for  character patterns, in the same way as
-       other grep commands do,  but  it  uses  the  PCRE2  regular  expression
-       library  to  support  patterns  that  are  compatible  with the regular
-       expressions of Perl 5. See pcre2syntax(3) for a quick-reference summary
-       of  pattern  syntax,  or  pcre2pattern(3) for a full description of the
-       syntax and semantics of the regular expressions that PCRE2 supports.
+       other grep commands do, but it uses the PCRE2  regular  expression  li-
+       brary  to support patterns that are compatible with the regular expres-
+       sions of Perl 5. See pcre2syntax(3) for a  quick-reference  summary  of
+       pattern syntax, or pcre2pattern(3) for a full description of the syntax
+       and semantics of the regular expressions that PCRE2 supports.

       Patterns, whether supplied on the command line or in a  separate  file,
       are given without delimiters. For example:
@ -26,8 +26,8 @@ DESCRIPTION
       If you attempt to use delimiters (for example, by surrounding a pattern
       with slashes, as is common in Perl scripts), they  are  interpreted  as
       part  of  the pattern. Quotes can of course be used to delimit patterns
-       on the command line because they are  interpreted  by  the  shell,  and
-       indeed  quotes  are required if a pattern contains white space or shell
+       on the command line because they are interpreted by the shell, and  in-
+       deed  quotes  are  required  if a pattern contains white space or shell
       metacharacters.

       The first argument that follows any option settings is treated  as  the
@ -42,20 +42,22 @@ DESCRIPTION

         pcre2grep some-pattern file1 - file3

-       Input files are searched line by  line.  By  default,  each  line  that
+       By default, input files are searched  line  by  line.  Each  line  that
       matches  a  pattern  is  copied to the standard output, and if there is
       more than one file, the file name is output at the start of each  line,
       followed  by  a  colon.  However, there are options that can change how
-       pcre2grep behaves. In particular, the -M option makes  it  possible  to
+       pcre2grep behaves. For example, the -M  option  makes  it  possible  to
       search  for  strings  that  span  line  boundaries. What defines a line
-       boundary is controlled by the -N (--newline) option.
+       boundary is controlled by the -N (--newline) option. The -h and -H  op-
+       tions  control  whether  or not file names are shown, and the -Z option
+       changes the file name terminator to a zero byte.

       The amount of memory used for buffering files that are being scanned is
       controlled  by  parameters  that  can  be  set by the --buffer-size and
       --max-buffer-size options. The first of these sets the size  of  buffer
       that  is obtained at the start of processing. If an input file contains
-       very long lines, a larger buffer may be  needed;  this  is  handled  by
-       automatically extending the buffer, up to the limit specified by --max-
+       very long lines, a larger buffer may be needed; this is handled by  au-
+       tomatically  extending  the buffer, up to the limit specified by --max-
       buffer-size. The default values for these parameters can  be  set  when
       pcre2grep  is  built;  if nothing is specified, the defaults are set to
       20KiB and 1MiB respectively. An error occurs if a line is too long  and
@ -75,12 +77,12 @@ DESCRIPTION
       By default, as soon as one pattern matches a line, no further  patterns
       are considered. However, if --colour (or --color) is used to colour the
       matching substrings, or if --only-matching, --file-offsets, or  --line-
-       offsets  is  used  to  output  only  the  part of the line that matched
-       (either shown literally, or as an offset), scanning resumes immediately
+       offsets  is  used to output only the part of the line that matched (ei-
+       ther shown literally, or as an offset),  scanning  resumes  immediately
       following  the  match,  so that further matches on the same line can be
-       found. If there are multiple  patterns,  they  are  all  tried  on  the
-       remainder  of  the  line, but patterns that follow the one that matched
-       are not tried on the earlier part of the line.
+       found. If there are multiple patterns, they are all tried  on  the  re-
+       mainder  of the line, but patterns that follow the one that matched are
+       not tried on the earlier matched part of the line.

       This behaviour means that the order  in  which  multiple  patterns  are
       specified  can affect the output when one of the above options is used.
@ -89,11 +91,11 @@ DESCRIPTION
       overlap).

       Patterns that can match an empty string are accepted, but empty  string
-       matches   are   never   recognized.   An   example   is   the   pattern
-       "(super)?(man)?", in which all components are  optional.  This  pattern
-       finds  all  occurrences  of  both "super" and "man"; the output differs
-       from matching with "super|man" when only the  matching  substrings  are
-       being shown.
+       matches   are  never  recognized.  An  example  is  the  pattern  "(su-
+       per)?(man)?", in which all components are optional. This pattern  finds
+       all  occurrences  of  both  "super"  and "man"; the output differs from
+       matching with "super|man" when only the matching substrings  are  being
+       shown.

       If  the  LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses
       the value to set a locale when calling the PCRE2 library.  The --locale
@ -115,11 +117,10 @@ BINARY FILES

       By  default,  a  file that contains a binary zero byte within the first
       1024 bytes is identified as a binary file, and is processed  specially.
-       (GNU grep identifies binary files in this manner.) However, if the new-
-       line type is specified as "nul", that is,  the  line  terminator  is  a
-       binary  zero,  the  test  for  a  binary  file  is not applied. See the
-       --binary-files option for a means of changing the way binary files  are
-       handled.
+       However,  if  the  newline  type is specified as NUL, that is, the line
+       terminator is a binary zero, the test for a binary file is not applied.
+       See  the  --binary-files  option for a means of changing the way binary
+       files are handled.


 BINARY ZEROS IN PATTERNS
@ -148,42 +149,51 @@ OPTIONS
                 Output  up  to  number  lines  of context after each matching
                 line. Fewer lines are output if the next match or the end  of
                 the  file  is  reached,  or if the processing buffer size has
-                 been  set  too  small.  If file names and/or line numbers are
-                 being output, a hyphen separator is used instead of  a  colon
-                 for  the  context  lines.  A  line  containing "--" is output
-                 between each group of lines, unless they are in fact contigu-
-                 ous  in the input file. The value of number is expected to be
-                 relatively small. When -c is used, -A is ignored.
+                 been set too small. If file names and/or line numbers are be-
+                 ing output, a hyphen separator is used instead of a colon for
+                 the context lines (the -Z option can be used  to  change  the
+                 file  name terminator to a zero byte). A line containing "--"
+                 is output between each group of lines,  unless  they  are  in
+                 fact contiguous in the input file. The value of number is ex-
+                 pected to be relatively small. When -c is  used,  -A  is  ig-
+                 nored.

       -a, --text
                 Treat  binary  files as text. This is equivalent to --binary-
                 files=text.

+       --allow-lookaround-bsk
+                 PCRE2 now forbids the use of \K in lookarounds by default, in
+                 line  with  Perl.   This  option  causes pcre2grep to set the
+                 PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which  enables  this
+                 somewhat dangerous usage.
+
       -B number, --before-context=number
                 Output  up  to  number  lines of context before each matching
                 line. Fewer lines are output if the  previous  match  or  the
                 start  of the file is within number lines, or if the process-
                 ing buffer size has been set too small. If file names  and/or
-                 line  numbers  are  being  output, a hyphen separator is used
-                 instead of a colon for the context lines. A  line  containing
-                 "--"  is  output between each group of lines, unless they are
-                 in fact contiguous in the input file. The value of number  is
-                 expected  to  be  relatively  small.  When  -c is used, -B is
-                 ignored.
+                 line numbers are being output, a hyphen separator is used in-
+                 stead of a colon for the context lines (the -Z option can  be
+                 used  to  change  the file name terminator to a zero byte). A
+                 line containing "--" is output between each group  of  lines,
+                 unless  they  are  in  fact contiguous in the input file. The
+                 value of number is expected to be relatively small.  When  -c
+                 is used, -B is ignored.

       --binary-files=word
                 Specify  how binary files are to be processed. If the word is
-                 "binary"  (the  default),  pattern  matching  is performed on
-                 binary files, but the only  output  is  "Binary  file  <name>
+                 "binary" (the default), pattern matching is performed on  bi-
+                 nary  files,  but  the  only  output  is  "Binary file <name>
                 matches" when a match succeeds. If the word is "text",  which
                 is  equivalent  to  the -a or --text option, binary files are
                 processed in the same way as any other file.  In  this  case,
                 when  a  match  succeeds,  the  output may be binary garbage,
                 which can have nasty effects if sent to a  terminal.  If  the
-                 word is  "without-match",  which  is  equivalent  to  the  -I
-                 option,  binary  files  are  not  processed  at all; they are
-                 assumed not to be of interest and are skipped without causing
-                 any output or affecting the return code.
+                 word  is  "without-match",  which is equivalent to the -I op-
+                 tion, binary files are not processed at all; they are assumed
+                 not  to  be  of  interest and are skipped without causing any
+                 output or affecting the return code.

       --buffer-size=number
                 Set the parameter that controls how much memory  is  obtained
@ -208,10 +218,10 @@ OPTIONS
                 If  no lines are selected, the number zero is output. If sev-
                 eral files are are being scanned, a count is output for  each
                 of  them and the -t option can be used to cause a total to be
-                 output  at  the  end.  However,  if  the --files-with-matches
-                 option is also  used,  only  those  files  whose  counts  are
-                 greater  than  zero  are listed. When -c is used, the -A, -B,
-                 and -C options are ignored.
+                 output at the end. However, if the  --files-with-matches  op-
+                 tion  is also used, only those files whose counts are greater
+                 than zero are listed. When -c is used, the -A, -B, and -C op-
+                 tions are ignored.

       --colour, --color
                 If this option is given without any data, it is equivalent to
@ -238,8 +248,8 @@ OPTIONS
                 semicolon,  except  in  the  case  of GREP_COLORS, which must
                 start with "ms=" or "mt=" followed by two semicolon-separated
                 colours,  terminated  by the end of the string or by a colon.
-                 If  GREP_COLORS  does  not  start  with  "ms=" or "mt=" it is
-                 ignored, and GREP_COLOR is checked.
+                 If GREP_COLORS does not start with "ms=" or "mt=" it  is  ig-
+                 nored, and GREP_COLOR is checked.

                 If  the  string obtained from one of the above variables con-
                 tains any characters other than semicolon or digits, the set-
@ -250,9 +260,9 @@ OPTIONS
                 set, the default is "1;31", which gives red.

       -D action, --devices=action
-                 If an input path is  not  a  regular  file  or  a  directory,
-                 "action"  specifies  how  it is to be processed. Valid values
-                 are "read" (the default) or "skip" (silently skip the path).
+                 If  an  input path is not a regular file or a directory, "ac-
+                 tion" specifies how it is to be processed. Valid  values  are
+                 "read" (the default) or "skip" (silently skip the path).

       -d action, --directories=action
                 If an input path is a directory, "action" specifies how it is
@ -261,8 +271,8 @@ OPTIONS
                 "recurse" (equivalent to the -r option), or "skip"  (silently
                 skip  the  path, the default in Windows environments). In the
                 "read" case, directories are read as if  they  were  ordinary
-                 files. In some operating systems  the  effect  of  reading  a
-                 directory like this is an immediate end-of-file; in others it
+                 files.  In some operating systems the effect of reading a di-
+                 rectory like this is an immediate end-of-file; in  others  it
                 may provoke an error.

       --depth-limit=number
@ -295,8 +305,8 @@ OPTIONS
                 whether  listed  on  the  command line, obtained from --file-
                 list, or by scanning a directory. The pattern is a PCRE2 reg-
                 ular  expression,  and is matched against the final component
-                 of  the  file  name,  not the entire path. The -F, -w, and -x
-                 options do not apply to this pattern. The option may be given
+                 of the file name, not the entire path. The -F, -w, and -x op-
+                 tions  do  not apply to this pattern. The option may be given
                 any number of times in order to specify multiple patterns. If
                 a  file  name matches both an --include and an --exclude pat-
                 tern, it is excluded. There is no short form for this option.
@ -310,29 +320,29 @@ OPTIONS

       --exclude-dir=pattern
                 Directories whose names match the pattern are skipped without
-                 being processed, whatever  the  setting  of  the  --recursive
-                 option.  This  applies  to all directories, whether listed on
-                 the command line, obtained from --file-list, or by scanning a
+                 being  processed, whatever the setting of the --recursive op-
+                 tion. This applies to all directories, whether listed on  the
+                 command  line,  obtained  from  --file-list, or by scanning a
                 parent directory. The pattern is a PCRE2 regular  expression,
                 and  is  matched against the final component of the directory
                 name, not the entire path. The -F, -w, and -x options do  not
                 apply  to this pattern. The option may be given any number of
                 times in order to specify more than one pattern. If a  direc-
-                 tory matches both  --include-dir  and  --exclude-dir,  it  is
-                 excluded. There is no short form for this option.
+                 tory  matches both --include-dir and --exclude-dir, it is ex-
+                 cluded. There is no short form for this option.

       -F, --fixed-strings
                 Interpret each data-matching  pattern  as  a  list  of  fixed
-                 strings, separated by  newlines,  instead  of  as  a  regular
-                 expression.  What  constitutes  a newline for this purpose is
-                 controlled by the --newline option. The -w (match as a  word)
-                 and  -x (match whole line) options can be used with -F.  They
-                 apply to each of the fixed strings. A line is selected if any
+                 strings,  separated  by newlines, instead of as a regular ex-
+                 pression. What constitutes a newline for this purpose is con-
+                 trolled by the --newline option. The -w (match as a word) and
+                 -x (match whole line) options can be used with -F.  They  ap-
+                 ply  to  each of the fixed strings. A line is selected if any
                 of the fixed strings are found in it (subject to -w or -x, if
                 present).  This  option applies only to the patterns that are
                 matched against the contents of files; it does not  apply  to
-                 patterns specified by  any  of  the  --include  or  --exclude
-                 options.
+                 patterns  specified  by any of the --include or --exclude op-
+                 tions.

       -f filename, --file=filename
                 Read patterns from the file, one per  line,  and  match  them
@ -360,8 +370,8 @@ OPTIONS
       --file-list=filename
                 Read a list of  files  and/or  directories  that  are  to  be
                 scanned from the given file, one per line. What constitutes a
-                 newline  when  reading  the  file  is  the operating system's
-                 default. Trailing white space is removed from each line,  and
+                 newline when reading the file is the operating  system's  de-
+                 fault.  Trailing  white  space is removed from each line, and
                 blank lines are ignored. These paths are processed before any
                 that  are  listed  on  the command line. The file name can be
                 given as "-" to refer to the standard input.  If  --file  and
@ -382,22 +392,25 @@ OPTIONS

       -H, --with-filename
                 Force  the  inclusion of the file name at the start of output
-                 lines when searching a single file. By default, the file name
-                 is not shown in this case.  For matching lines, the file name
-                 is followed by a colon; for context lines, a hyphen separator
-                 is used. If a line number is also being  output,  it  follows
-                 the  file  name. When the -M option causes a pattern to match
-                 more than one line, only the first is preceded  by  the  file
-                 name.  This  option  overrides  any  previous  -h,  -l, or -L
-                 options.
+                 lines when searching a single file. The file name is not nor-
+                 mally  shown  in  this case.  By default, for matching lines,
+                 the file name is followed by a colon; for  context  lines,  a
+                 hyphen separator is used. The -Z option can be used to change
+                 the terminator to a zero byte. If a line number is also being
+                 output, it follows the file name. When the -M option causes a
+                 pattern to match more than one line, only the first  is  pre-
+                 ceded  by  the  file name. This option overrides any previous
+                 -h, -l, or -L options.

       -h, --no-filename
                 Suppress the output file names when searching multiple files.
-                 By  default,  file  names  are  shown when multiple files are
-                 searched. For matching lines, the file name is followed by  a
-                 colon;  for  context lines, a hyphen separator is used.  If a
-                 line number is also being output, it follows the  file  name.
-                 This option overrides any previous -H, -L, or -l options.
+                 File  names  are  normally  shown  when  multiple  files  are
+                 searched. By default, for matching lines, the  file  name  is
+                 followed by a colon; for context lines, a hyphen separator is
+                 used. The -Z option can be used to change the terminator to a
+                 zero  byte. If a line number is also being output, it follows
+                 the file name.  This option overrides any previous -H, -L, or
+                 -l options.

       --heap-limit=number
                 See --match-limit below.
@ -414,17 +427,17 @@ OPTIONS

       --include=pattern
                 If any --include patterns are specified, the only files  that
-                 are  processed  are those that match one of the patterns (and
-                 do not match an --exclude  pattern).  This  option  does  not
+                 are processed are those whose names match one of the patterns
+                 and do not match an --exclude pattern. This option  does  not
                 affect  directories,  but  it  applies  to all files, whether
                 listed on the command line, obtained from --file-list, or  by
                 scanning  a directory. The pattern is a PCRE2 regular expres-
                 sion, and is matched against the final component of the  file
                 name,  not the entire path. The -F, -w, and -x options do not
                 apply to this pattern. The option may be given any number  of
-                 times.  If  a  file  name  matches  both  an --include and an
-                 --exclude pattern, it is excluded.  There is  no  short  form
-                 for this option.
+                 times.  If a file name matches both an --include and an --ex-
+                 clude pattern, it is excluded.  There is no  short  form  for
+                 this option.

       --include-from=filename
                 Treat  each  non-empty  line  of  the file as the data for an
@ -435,11 +448,11 @@ OPTIONS

       --include-dir=pattern
                 If  any --include-dir patterns are specified, the only direc-
-                 tories that are processed are those that  match  one  of  the
-                 patterns  (and  do  not match an --exclude-dir pattern). This
+                 tories that are processed are those whose names match one  of
+                 the  patterns and do not match an --exclude-dir pattern. This
                 applies to all directories, whether  listed  on  the  command
-                 line,  obtained  from  --file-list,  or  by scanning a parent
-                 directory. The pattern is a PCRE2 regular expression, and  is
+                 line,  obtained from --file-list, or by scanning a parent di-
+                 rectory. The pattern is a PCRE2 regular  expression,  and  is
                 matched  against  the  final component of the directory name,
                 not the entire path. The -F, -w, and -x options do not  apply
                 to this pattern. The option may be given any number of times.
@ -450,20 +463,23 @@ OPTIONS
                 Instead  of  outputting lines from the files, just output the
                 names of the files that do not contain any lines  that  would
                 have  been  output. Each file name is output once, on a sepa-
-                 rate line. This option overrides any previous -H, -h,  or  -l
-                 options.
+                 rate line by default, but if the -Z option is set,  they  are
+                 separated  by  zero  bytes  instead  of newlines. This option
+                 overrides any previous -H, -h, or -l options.

       -l, --files-with-matches
                 Instead of outputting lines from the files, just  output  the
                 names of the files containing lines that would have been out-
-                 put.  Each  file  name  is  output  once, on a separate line.
-                 Searching normally stops as soon as a matching line is  found
-                 in  a  file.  However, if the -c (count) option is also used,
-                 matching continues in order to obtain the correct count,  and
-                 those  files  that  have  at least one match are listed along
-                 with their counts. Using this option with -c is a way of sup-
-                 pressing  the  listing  of files with no matches. This opeion
-                 overrides any previous -H, -h, or -L options.
+                 put. Each file name is output once, on a separate  line,  but
+                 if the -Z option is set, they are separated by zero bytes in-
+                 stead of newlines. Searching normally  stops  as  soon  as  a
+                 matching  line is found in a file. However, if the -c (count)
+                 option is also used, matching continues in  order  to  obtain
+                 the  correct  count,  and  those files that have at least one
+                 match are listed along with their counts. Using  this  option
+                 with  -c is a way of suppressing the listing of files with no
+                 matches that occurs with -c on its own. This option overrides
+                 any previous -H, -h, or -L options.

       --label=name
                 This option supplies a name to be used for the standard input
@ -474,15 +490,15 @@ OPTIONS
                 When  this  option is given, non-compressed input is read and
                 processed line by line, and the output is flushed after  each
                 write.  By  default,  input  is  read in large chunks, unless
-                 pcre2grep  can  determine  that it is reading from a terminal
-                 (which is currently possible only in  Unix-like  environments
-                 or  Windows).  Output  to  terminal is normally automatically
-                 flushed by the operating system. This option  can  be  useful
-                 when the input or output is attached to a pipe and you do not
-                 want pcre2grep to buffer up large amounts of data.   However,
-                 its  use  will  affect  performance,  and  the -M (multiline)
-                 option ceases to work. When input is from a compressed .gz or
-                 .bz2 file, --line-buffered is ignored.
+                 pcre2grep can determine that it is reading from  a  terminal,
+                 which is currently possible only in Unix-like environments or
+                 Windows. Output to terminal is normally automatically flushed
+                 by  the  operating system. This option can be useful when the
+                 input or output is attached to a pipe and  you  do  not  want
+                 pcre2grep  to  buffer up large amounts of data.  However, its
+                 use will affect performance, and the  -M  (multiline)  option
+                 ceases  to  work. When input is from a compressed .gz or .bz2
+                 file, --line-buffered is ignored.

       --line-offsets
                 Instead of showing lines or parts of lines that  match,  show
@ -498,50 +514,9 @@ OPTIONS
       --locale=locale-name
                 This option specifies a locale to be used for pattern  match-
                 ing.  It  overrides the value in the LC_ALL or LC_CTYPE envi-
-                 ronment  variables.  If  no  locale  is  specified, the PCRE2
-                 library's default (usually the "C" locale) is used. There  is
-                 no short form for this option.
-
-       --match-limit=number
-                 Processing  some  regular expression patterns may take a very
-                 long time to search for all possible matching strings. Others
-                 may  require  a  very large amount of memory. There are three
-                 options that set resource limits for matching.
-
-                 The --match-limit option provides a means of limiting comput-
-                 ing  resource  usage  when  processing  patterns that are not
-                 going to match, but which have a very large number of  possi-
-                 bilities in their search trees. The classic example is a pat-
-                 tern that uses nested unlimited  repeats.  Internally,  PCRE2
-                 has  a  counter that is incremented each time around its main
-                 processing  loop.  If  the  value  set  by  --match-limit  is
-                 reached, an error occurs.
-
-                 The  --heap-limit  option specifies, as a number of kibibytes
-                 (units of 1024 bytes), the amount of heap memory that may  be
-                 used for matching. Heap memory is needed only if matching the
-                 pattern requires a significant number of nested  backtracking
-                 points to be remembered. This parameter can be set to zero to
-                 forbid the use of heap memory altogether.
-
-                 The --depth-limit option limits the  depth  of  nested  back-
-                 tracking points, which indirectly limits the amount of memory
-                 that is used. The amount of memory needed for each backtrack-
-                 ing  point  depends on the number of capturing parentheses in
-                 the pattern, so the amount of memory that is used before this
-                 limit  acts  varies from pattern to pattern. This limit is of
-                 use only if it is set smaller than --match-limit.
-
-                 There are no short forms for these options. The default  lim-
-                 its  can  be  set when the PCRE2 library is compiled; if they
-                 are not specified, the defaults are very large and so  effec-
-                 tively unlimited.
-
-       --max-buffer-size=number
-                 This  limits  the  expansion  of the processing buffer, whose
-                 initial size can be set by --buffer-size. The maximum  buffer
-                 size  is  silently  forced to be no smaller than the starting
-                 buffer size.
+                 ronment variables. If no locale is specified, the  PCRE2  li-
+                 brary's default (usually the "C" locale) is used. There is no
+                 short form for this option.

       -M, --multiline
                 Allow patterns to match more than one line. When this  option
@ -567,10 +542,10 @@ OPTIONS

                   pcre2grep -M 'regular\s+expression' <file>

-                 The  \s  escape  sequence  matches any white space character,
-                 including newlines, and is followed  by  +  so  as  to  match
-                 trailing  white  space  on the first line as well as possibly
-                 handling a two-character newline sequence.
+                 The \s escape sequence matches any white space character, in-
+                 cluding newlines, and is followed by + so as to match  trail-
+                 ing  white  space  on the first line as well as possibly han-
+                 dling a two-character newline sequence.

                 There is a limit to the number of lines that can be  matched,
                 imposed  by  the way that pcre2grep buffers the input file as
@ -578,31 +553,87 @@ OPTIONS
                 this should not be a problem, but the -M option does not work
                 when input is read line by line (see --line-buffered.)

-       -N newline-type, --newline=newline-type
-                 The PCRE2 library supports  five  different  conventions  for
-                 indicating  the  ends of lines. They are the single-character
-                 sequences CR (carriage return) and LF  (linefeed),  the  two-
-                 character  sequence CRLF, an "anycrlf" convention, which rec-
-                 ognizes any of the preceding three types, and an  "any"  con-
-                 vention, in which any Unicode line ending sequence is assumed
-                 to end a line. The Unicode sequences are the three just  men-
-                 tioned,  plus  VT  (vertical  tab,  U+000B),  FF  (form feed,
-                 U+000C),  NEL  (next  line,  U+0085),  LS  (line   separator,
-                 U+2028), and PS (paragraph separator, U+2029).
+       -m number, --max-count=number
+                 Stop processing after finding number matching lines, or  non-
+                 matching  lines if -v is also set. Any trailing context lines
+                 are output after the final match.  In  multiline  mode,  each
+                 multiline  match counts as just one line for this purpose. If
+                 this limit is reached when reading the standard input from  a
+                 regular file, the file is left positioned just after the last
+                 matching line.  If -c is also set, the count that  is  output
+                 is  never  greater  than number. This option has no effect if
+                 used with -L, -l, or -q, or when just checking for a match in
+                 a binary file.

-                 When  the  PCRE2  library  is  built,  a  default line-ending
-                 sequence  is  specified.   This  is  normally  the   standard
-                 sequence for the operating system. Unless otherwise specified
-                 by this option, pcre2grep uses the  library's  default.   The
-                 possible values for this option are CR, LF, CRLF, ANYCRLF, or
-                 ANY. This makes it possible to use pcre2grep  to  scan  files
+       --match-limit=number
+                 Processing  some  regular expression patterns may take a very
+                 long time to search for all possible matching strings. Others
+                 may  require  a  very large amount of memory. There are three
+                 options that set resource limits for matching.
+
+                 The --match-limit option provides a means of limiting comput-
+                 ing  resource usage when processing patterns that are not go-
+                 ing to match, but which have a very large number of possibil-
+                 ities in their search trees. The classic example is a pattern
+                 that uses nested unlimited repeats. Internally, PCRE2  has  a
+                 counter  that  is  incremented each time around its main pro-
+                 cessing loop. If the value set by --match-limit  is  reached,
+                 an error occurs.
+
+                 The  --heap-limit  option specifies, as a number of kibibytes
+                 (units of 1024 bytes), the maximum amount of heap memory that
+                 may be used for matching.
+
+                 The  --depth-limit  option  limits  the depth of nested back-
+                 tracking points, which indirectly limits the amount of memory
+                 that is used. The amount of memory needed for each backtrack-
+                 ing point depends on the number of capturing  parentheses  in
+                 the pattern, so the amount of memory that is used before this
+                 limit acts varies from pattern to pattern. This limit  is  of
+                 use only if it is set smaller than --match-limit.
+
+                 There  are no short forms for these options. The default lim-
+                 its can be set when the PCRE2 library is  compiled;  if  they
+                 are  not specified, the defaults are very large and so effec-
+                 tively unlimited.
+
+       --max-buffer-size=number
+                 This limits the expansion of  the  processing  buffer,  whose
+                 initial  size can be set by --buffer-size. The maximum buffer
+                 size is silently forced to be no smaller  than  the  starting
+                 buffer size.
+
+       -N newline-type, --newline=newline-type
+                 Six different conventions for indicating the ends of lines in
+                 scanned files are supported. For example:
+
+                   pcre2grep -N CRLF 'some pattern' <file>
+
+                 The newline type may be specified in upper, lower,  or  mixed
+                 case.  If the newline type is NUL, lines are separated by bi-
+                 nary zero characters. The other types are the  single-charac-
+                 ter  sequences  CR  (carriage  return) and LF (linefeed), the
+                 two-character sequence CRLF, an "anycrlf" type, which  recog-
+                 nizes  any  of  the preceding three types, and an "any" type,
+                 for which any Unicode line ending sequence is assumed to  end
+                 a  line.  The Unicode sequences are the three just mentioned,
+                 plus VT (vertical tab, U+000B), FF (form feed,  U+000C),  NEL
+                 (next  line,  U+0085),  LS  (line  separator, U+2028), and PS
+                 (paragraph separator, U+2029).
+
+                 When the PCRE2 library is built, a  default  line-ending  se-
+                 quence  is specified.  This is normally the standard sequence
+                 for the operating system. Unless otherwise specified by  this
+                 option, pcre2grep uses the library's default.
+
+                 This  option makes it possible to use pcre2grep to scan files
                 that have come from other environments without having to mod-
                 ify  their  line  endings.  If the data that is being scanned
                 does not agree  with  the  convention  set  by  this  option,
                 pcre2grep  may  behave in strange ways. Note that this option
                 does not apply to files specified by the -f,  --exclude-from,
-                 or --include-from options, which  are  expected  to  use  the
-                 operating system's standard newline sequence.
+                 or  --include-from options, which are expected to use the op-
+                 erating system's standard newline sequence.

       -n, --line-number
                 Precede each output line by its line number in the file, fol-
@ -620,12 +651,19 @@ OPTIONS
                 lems.  It should never be needed in normal use.

       -O text, --output=text
-                 When  there  is a match, instead of outputting the whole line
-                 that matched, output just the  given  text.  This  option  is
-                 mutually  exclusive with --only-matching, --file-offsets, and
-                 --line-offsets. Escape sequences starting with a dollar char-
-                 acter  may be used to insert the contents of the matched part
-                 of the line and/or captured substrings into the text.
+                 When there is a match, instead of outputting  the  line  that
+                 matched,  output just the text specified in this option, fol-
+                 lowed by an operating-system standard newline. In this  mode,
+                 no  context is shown. That is, the -A, -B, and -C options are
+                 ignored. The --newline option has no effect on  this  option,
+                 which is mutually exclusive with --only-matching, --file-off-
+                 sets, and --line-offsets. However, like  --only-matching,  if
+                 there is more than one match in a line, each of them causes a
+                 line of output.
+
+                 Escape sequences starting with a dollar character may be used
+                 to insert the contents of the matched part of the line and/or
+                 captured substrings into the text.

                 $<digits> or ${<digits>} is replaced  by  the  captured  sub-
                 string  of  the  given  decimal  number; zero substitutes the
@ -637,11 +675,17 @@ OPTIONS
                 form  feed;  $n by newline; $r by carriage return; $t by tab;
                 $v by vertical tab.

-                 $o<digits> is replaced by the character  represented  by  the
-                 given octal number; up to three digits are processed.
+                 $o<digits> or $o{<digits>} is replaced by the character whose
+                 code  point  is the given octal number. In the first form, up
+                 to three octal digits are processed.  When  more  digits  are
+                 needed  in Unicode mode to specify a wide character, the sec-
+                 ond form must be used.

-                 $x<digits>  is  replaced  by the character represented by the
-                 given hexadecimal number; up to two digits are processed.
+                 $x<digits> or $x{<digits>} is replaced by the character  rep-
+                 resented  by the given hexadecimal number. In the first form,
+                 up to two hexadecimal digits are processed. When more  digits
+                 are  needed  in Unicode mode to specify a wide character, the
+                 second form must be used.

                 Any other character is substituted by itself. In  particular,
                 $$ is replaced by a single dollar.
@ -651,9 +695,9 @@ OPTIONS
                 of the whole line. In this mode, no context  is  shown.  That
                 is,  the -A, -B, and -C options are ignored. If there is more
                 than one match in a line, each of them is  shown  separately,
-                 on  a  separate  line  of  output.  If -o is combined with -v
-                 (invert the sense of the match to find  non-matching  lines),
-                 no  output is generated, but the return code is set appropri-
+                 on  a separate line of output. If -o is combined with -v (in-
+                 vert the sense of the match to find non-matching  lines),  no
+                 output  is  generated,  but  the return code is set appropri-
                 ately. If the matched portion of the line is  empty,  nothing
                 is  output  unless  the  file  name  or line number are being
                 printed, in which case they are shown on an  otherwise  empty
@ -662,23 +706,32 @@ OPTIONS

       -onumber, --only-matching=number
                 Show only the part of the line  that  matched  the  capturing
-                 parentheses of the given number. Up to 32 capturing parenthe-
-                 ses are supported, and -o0 is equivalent to -o without a num-
-                 ber.  Because  these options can be given without an argument
-                 (see above), if an argument is present, it must be  given  in
-                 the  same  shell item, for example, -o3 or --only-matching=2.
-                 The comments given for the non-argument case above also apply
-                 to this option. If the specified capturing parentheses do not
-                 exist in the pattern, or were not set in the  match,  nothing
-                 is  output unless the file name or line number are being out-
-                 put.
+                 parentheses of the given number. Up to 50 capturing parenthe-
+                 ses are supported by default. This limit can be  changed  via
+                 the  --om-capture option. A pattern may contain any number of
+                 capturing parentheses, but only those whose number is  within
+                 the  limit can be accessed by -o. An error occurs if the num-
+                 ber specified by -o is greater than the limit.
+
+                 -o0 is the same as -o without a number. Because these options
+                 can  be given without an argument (see above), if an argument
+                 is present, it must be given in the same shell item, for  ex-
+                 ample,  -o3  or --only-matching=2. The comments given for the
+                 non-argument case above also apply to  this  option.  If  the
+                 specified  capturing parentheses do not exist in the pattern,
+                 or were not set in the match, nothing is  output  unless  the
+                 file name or line number are being output.

                 If  this  option is given multiple times, multiple substrings
                 are output for each match,  in  the  order  the  options  are
                 given,  and  all on one line. For example, -o3 -o1 -o3 causes
                 the substrings matched by capturing parentheses 3 and  1  and
                 then  3 again to be output. By default, there is no separator
-                 (but see the next option).
+                 (but see the next but one option).
+
+       --om-capture=number
+                 Set the number of capturing parentheses that can be  accessed
+                 by -o. The default is 50.

       --om-separator=text
                 Specify  a  separating string for multiple occurrences of -o.
@ -695,11 +748,12 @@ OPTIONS
                 it contains, taking note of any --include and --exclude  set-
                 tings.  By  default, a directory is read as a normal file; in
                 some operating systems this gives an  immediate  end-of-file.
-                 This  option  is  a  shorthand  for  setting the -d option to
-                 "recurse".
+                 This  option is a shorthand for setting the -d option to "re-
+                 curse".

       --recursion-limit=number
-                 See --match-limit above.
+                 This is an obsolete synonym for --depth-limit.  See  --match-
+                 limit above for details.

       -s, --no-messages
                 Suppress  error  messages  about  non-existent  or unreadable
@ -710,8 +764,8 @@ OPTIONS
                 This  option  is  useful when scanning more than one file. If
                 used on its own, -t suppresses all output except for a  grand
                 total  number  of matching lines (or non-matching lines if -v
-                 is  used)  in  all  the files. If -t is used with -c, a grand
-                 total is output except when the previous output is  just  one
+                 is used) in all the files. If -t is used with -c, a grand to-
+                 tal  is  output  except  when the previous output is just one
                 line. In other words, it is not output when just  one  file's
                 count  is  listed.  If file names are being output, the grand
                 total is preceded by "TOTAL:". Otherwise, it appears as  just
@ -719,12 +773,20 @@ OPTIONS
                 (list files without matches), because the grand  total  would
                 always be zero.

-       -u, --utf-8
-                 Operate in UTF-8 mode. This option is available only if PCRE2
+       -u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
                 has been compiled with UTF-8 support. All patterns (including
-                 those for any --exclude and --include options) and  all  sub-
-                 ject  lines  that  are scanned must be valid strings of UTF-8
-                 characters.
+                 those  for any --exclude and --include options) and all lines
+                 that are scanned must be valid strings of  UTF-8  characters.
+                 If an invalid UTF-8 string is encountered, an error occurs.
+
+       -U, --utf-allow-invalid
+                 As  --utf,  but in addition subject lines may contain invalid
+                 UTF-8 code unit sequences. These can never form part  of  any
+                 pattern  match.  Patterns  themselves, however, must still be
+                 valid UTF-8 strings. This facility allows valid UTF-8 strings
+                 to be sought within arbitrary byte sequences in executable or
+                 other binary files. For more details about matching  in  non-
+                 valid UTF-8 strings, see the pcre2unicode(3) documentation.

       -V, --version
                 Write  the version numbers of pcre2grep and the PCRE2 library
@ -733,7 +795,10 @@ OPTIONS

       -v, --invert-match
                 Invert  the  sense  of  the match, so that lines which do not
-                 match any of the patterns are the ones that are found.
+                 match any of the patterns are the ones that are  found.  When
+                 this  option  is  set,  options  such  as --only-matching and
+                 --output, which specify parts of a match that are to be  out-
+                 put, are ignored.

       -w, --word-regex, --word-regexp
                 Force the patterns only to match "words". That is, there must
@ -754,28 +819,45 @@ OPTIONS
                 does  not apply to patterns specified by any of the --include
                 or --exclude options.

+       -Z, --null
+                 Terminate files names in the regular output with a zero  byte
+                 (the  NUL  character)  instead of what would normally appear.
+                 This is useful when file  names  contain  unusual  characters
+                 such  as  colons,  hyphens, or even newlines. The option does
+                 not apply to file names in error messages.
+

 ENVIRONMENT VARIABLES

-       The environment variables LC_ALL and LC_CTYPE  are  examined,  in  that
-       order,  for  a  locale.  The first one that is set is used. This can be
-       overridden by the --locale option. If  no  locale  is  set,  the  PCRE2
-       library's default (usually the "C" locale) is used.
+       The environment variables LC_ALL and LC_CTYPE are examined, in that or-
+       der, for a locale. The first one that is set is used. This can be over-
+       ridden by the --locale option. If no locale is set, the PCRE2 library's
+       default (usually the "C" locale) is used.


 NEWLINES

-       The -N (--newline) option allows pcre2grep to scan files with different
-       newline conventions from the default. Any parts of the input files that
-       are  written  to the standard output are copied identically, with what-
-       ever newline sequences they have in the input. However, the setting  of
-       this  option  affects only the way scanned files are processed. It does
-       not affect the interpretation of files specified  by  the  -f,  --file-
-       list, --exclude-from, or --include-from options, nor does it affect the
-       way in which pcre2grep writes informational messages  to  the  standard
-       error and output streams. For these it uses the string "\n" to indicate
-       newlines, relying on the C I/O library to convert this to an  appropri-
-       ate sequence.
+       The  -N  (--newline) option allows pcre2grep to scan files with newline
+       conventions that differ from the default. This option affects only  the
+       way  scanned files are processed. It does not affect the interpretation
+       of files specified by the -f,  --file-list,  --exclude-from,  or  --in-
+       clude-from options.
+
+       Any  parts  of the scanned input files that are written to the standard
+       output are copied with whatever newline sequences they have in the  in-
+       put.  However,  if  the final line of a file is output, and it does not
+       end with a newline sequence, a newline sequence is added. If  the  new-
+       line  setting  is  CR, LF, CRLF or NUL, that line ending is output; for
+       the other settings (ANYCRLF or ANY) a single NL is used.
+
+       The newline setting does not affect the way in which  pcre2grep  writes
+       newlines  in  informational  messages  to the standard output and error
+       streams.  Under Windows, the standard output is set to  be  binary,  so
+       that  "\r\n" at the ends of output lines that are copied from the input
+       is not converted to "\r\r\n" by the C I/O library. This means that  any
+       messages  written  to the standard output must end with "\r\n". For all
+       other operating systems, and for all messages  to  the  standard  error
+       stream, "\n" is used.


 OPTIONS COMPATIBILITY
@ -785,9 +867,9 @@ OPTIONS COMPATIBILITY
       terminology) is also available as --xxx-regex (PCRE2 terminology). How-
       ever, the  --depth-limit,  --file-list,  --file-offsets,  --heap-limit,
       --include-dir,  --line-offsets,  --locale,  --match-limit, -M, --multi-
-       line, -N, --newline, --om-separator, --output, -u, and --utf-8  options
-       are  specific to pcre2grep, as is the use of the --only-matching option
-       with a capturing parentheses number.
+       line, -N, --newline,  --om-separator,  --output,  -u,  --utf,  -U,  and
+       --utf-allow-invalid options are specific to pcre2grep, as is the use of
+       the --only-matching option with a capturing parentheses number.

       Although most of the common options work the same way, a few  are  dif-
       ferent  in pcre2grep. For example, the --include option's argument is a
@ -818,13 +900,13 @@ OPTIONS WITH DATA
         --file /some/file

       Note,  however, that if you want to supply a file name beginning with ~
-       as data in a shell command, and have the  shell  expand  ~  to  a  home
-       directory, you must separate the file name from the option, because the
+       as data in a shell command, and have the shell expand ~ to a  home  di-
+       rectory,  you  must separate the file name from the option, because the
       shell does not treat ~ specially unless it is at the start of an item.

       The exceptions to the above are the --colour (or --color)  and  --only-
-       matching  options,  for  which  the  data  is optional. If one of these
-       options does have data, it must be given in the first  form,  using  an
+       matching  options,  for which the data is optional. If one of these op-
+       tions does have data, it must be given in  the  first  form,  using  an
       equals character. Otherwise pcre2grep will assume that it has no data.


@ -834,8 +916,8 @@ USING PCRE2'S CALLOUT FACILITY
       scripts or echoing specific strings during matching by  making  use  of
       PCRE2's  callout  facility.  However, this support can be completely or
       partially disabled when pcre2grep is built. You can  find  out  whether
-       your  binary  has  support  for  callouts by running it with the --help
-       option. If callout support is completely disabled, all callouts in pat-
+       your  binary has support for callouts by running it with the --help op-
+       tion. If callout support is completely disabled, all callouts  in  pat-
       terns are ignored by pcre2grep.  If the facility is partially disabled,
       calling external programs is not supported, and callouts  that  request
       it are ignored.
@ -845,12 +927,35 @@ USING PCRE2'S CALLOUT FACILITY
       mentation  for  details).  Numbered  callouts are ignored by pcre2grep;
       only callouts with string arguments are useful.

+   Echoing a specific string
+
+       Starting the callout string with a pipe character  invokes  an  echoing
+       facility that avoids calling an external program or script. This facil-
+       ity is always available, provided that  callouts  were  not  completely
+       disabled  when  pcre2grep  was built. The rest of the callout string is
+       processed as a zero-terminated string, which means it should  not  con-
+       tain  any  internal  binary  zeros. It is written to the output, having
+       first been passed through the same escape processing as text  from  the
+       --output  (-O) option (see above). However, $0 cannot be used to insert
+       a matched substring because the match is still  in  progress.  Instead,
+       the  single  character '0' is inserted. Any syntax errors in the string
+       (for example, a dollar not followed by another  character)  causes  the
+       callout  to be ignored. No terminator is added to the output string, so
+       if you want a newline, you must include it explicitly using the  escape
+       $n. For example:
+
+         pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
+
+       Matching  continues normally after the string is output. If you want to
+       see only the callout output but not any output from  an  actual  match,
+       you should end the pattern with (*FAIL).
+
   Calling external programs or scripts

       This facility can be independently disabled when pcre2grep is built. It
       is supported for Windows, where a call to _spawnvp() is used, for  VMS,
-       where lib$spawn() is used, and  for  any  other  Unix-like  environment
-       where fork() and execv() are available.
+       where  lib$spawn()  is  used,  and  for any Unix-like environment where
+       fork() and execv() are available.

       If the callout string does not start with a pipe (vertical bar) charac-
       ter,  it  is parsed into a list of substrings separated by pipe charac-
@ -859,16 +964,12 @@ USING PCRE2'S CALLOUT FACILITY

         executable_name|arg1|arg2|...

-       Any substring  (including  the  executable  name)  may  contain  escape
-       sequences  started  by  a dollar character: $<digits> or ${<digits>} is
-       replaced by the captured substring of the given decimal  number,  which
-       must  be greater than zero. If the number is greater than the number of
-       capturing substrings, or if the capture is unset,  the  replacement  is
-       empty.
-
-       Any  other  character  is  substituted  by itself. In particular, $$ is
-       replaced by a single dollar and $| is replaced  by  a  pipe  character.
-       Here is an example:
+       Any  substring  (including  the executable name) may contain escape se-
+       quences started by a dollar character. These are the same  as  for  the
+       --output (-O) option documented above, except that $0 cannot insert the
+       matched string because the match is still  in  progress.  Instead,  the
+       character '0' is inserted. If you need a literal dollar or pipe charac-
+       ter in any substring, use $$ or $| respectively. Here is an example:

         echo -e "abcde\n12345" | pcre2grep \
           '(?x)(.)(..(.))
@ -886,31 +987,18 @@ USING PCRE2'S CALLOUT FACILITY
       ters in the callout argument will cause premature termination of  their
       substrings,  and  therefore should not be present. Any syntax errors in
       the string (for example, a dollar not followed  by  another  character)
-       cause the callout to be ignored. If running the program fails  for  any
+       causes the callout to be ignored.  If running the program fails for any
       reason (including the non-existence of the executable), a local  match-
       ing failure occurs and the matcher backtracks in the normal way.

-   Echoing a specific string
-
-       This facility is always available, provided that callouts were not com-
-       pletely disabled when pcre2grep was built. If the callout string starts
-       with a pipe (vertical bar) character, the rest of the string is written
-       to the output, having been passed through the same escape processing as
-       text from the --output option. This provides a simple echoing  facility
-       that  avoids  calling  an  external program or script. No terminator is
-       added to the string, so if you want a  newline,  you  must  include  it
-       explicitly.  Matching continues normally after the string is output. If
-       you want to see only the callout output but  not  any  output  from  an
-       actual match, you should end the relevant pattern with (*FAIL).
-

 MATCHING ERRORS

       It  is  possible  to supply a regular expression that takes a very long
       time to fail to match certain lines.  Such  patterns  normally  involve
       nested  indefinite repeats, for example: (a+)*\d when matched against a
-       line of a's with no final digit. The  PCRE2  matching  function  has  a
-       resource  limit that causes it to abort in these circumstances. If this
+       line of a's with no final digit. The PCRE2 matching function has a  re-
+       source  limit  that  causes it to abort in these circumstances. If this
       happens, pcre2grep outputs an error message and the  line  that  caused
       the  problem  to  the  standard error stream. If there are more than 20
       such errors, pcre2grep gives up.
@ -936,17 +1024,17 @@ DIAGNOSTICS

 SEE ALSO

-       pcre2pattern(3), pcre2syntax(3), pcre2callout(3).
+       pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3).


 AUTHOR

       Philip Hazel
-       University Computing Service
+       Retired from University Computing Service
       Cambridge, England.


 REVISION

-       Last updated: 24 November 2018
-       Copyright (c) 1997-2018 University of Cambridge.
+       Last updated: 30 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
--- a/doc/pcre2jit.3
+++ b/doc/pcre2jit.3
@ -1,4 +1,4 @@
-.TH PCRE2JIT 3 "06 March 2019" "PCRE2 10.33"
+.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
@ -29,6 +29,7 @@ platforms:
 .sp
  ARM 32-bit (v5, v7, and Thumb2)
  ARM 64-bit
+  IBM s390x 64 bit
  Intel x86 32-bit and 64-bit
  MIPS 32-bit and 64-bit
  Power PC 32-bit and 64-bit
@ -64,7 +65,7 @@ or a negative error code.
 There is a limit to the size of pattern that JIT supports, imposed by the size
 of machine stack that it uses. The exact rules are not documented because they
 may change at any time, in particular, when new optimizations are introduced.
-If a pattern is too big, a call to \fBpcre2_jit_compile()\fB returns
+If a pattern is too big, a call to \fBpcre2_jit_compile()\fP returns
 PCRE2_ERROR_NOMEMORY.
 .P
 PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete
@ -123,23 +124,29 @@ pattern.
 .SH "MATCHING SUBJECTS CONTAINING INVALID UTF"
 .rs
 .sp
-When a pattern is compiled with the PCRE2_UTF option, the interpretive matching
-function expects its subject string to be a valid sequence of UTF code units.
-If it is not, the result is undefined. This is also true by default of matching
-via JIT. However, if the option PCRE2_JIT_INVALID_UTF is passed to
-\fBpcre2_jit_compile()\fP, code that can process a subject containing invalid
-UTF is compiled.
+When a pattern is compiled with the PCRE2_UTF option, subject strings are
+normally expected to be a valid sequence of UTF code units. By default, this is
+checked at the start of matching and an error is generated if invalid UTF is
+detected. The PCRE2_NO_UTF_CHECK option can be passed to \fBpcre2_match()\fP to
+skip the check (for improved performance) if you are sure that a subject string
+is valid. If this option is used with an invalid string, the result is
+undefined.
 .P
-In this mode, an invalid code unit sequence never matches any pattern item. It
-does not match dot, it does not match \ep{Any}, it does not even match negative
-items such as [^X]. A lookbehind assertion fails if it encounters an invalid
-sequence while moving the current point backwards. In other words, an invalid
-UTF code unit sequence acts as a barrier which no match can cross. Reaching an
-invalid sequence causes an immediate backtrack.
+However, a way of running matches on strings that may contain invalid UTF
+sequences is available. Calling \fBpcre2_compile()\fP with the
+PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in
+\fBpcre2_match()\fP to support invalid UTF, and, if \fBpcre2_jit_compile()\fP
+is called, the compiled JIT code also supports invalid UTF. Details of how this
+support works, in both the JIT and the interpretive cases, is given in the
+.\" HREF
+\fBpcre2unicode\fP
+.\"
+documentation.
 .P
-Using this option, an application can run matches in arbitrary data, knowing
-that any matched strings that are returned will be valid UTF. This can be
-useful when searching for text in executable or other binary files.
+There is also an obsolete option for \fBpcre2_jit_compile()\fP called
+PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility.
+It is superseded by the \fBpcre2_compile()\fP option PCRE2_MATCH_INVALID_UTF
+and should no longer be used. It may be removed in future.
 .
 .
 .SH "UNSUPPORTED OPTIONS AND PATTERN ITEMS"
@ -244,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
 starts another match, that match must use a different JIT stack to the one used
 for currently suspended match(es).
 .P
-In a multithread application, if you do not
-specify a JIT stack, or if you assign or pass back NULL from a callback, that
-is thread-safe, because each thread has its own machine stack. However, if you
-assign or pass back a non-NULL JIT stack, this must be a different stack for
-each thread so that the application is thread-safe.
+In a multithread application, if you do not specify a JIT stack, or if you
+assign or pass back NULL from a callback, that is thread-safe, because each
+thread has its own machine stack. However, if you assign or pass back a
+non-NULL JIT stack, this must be a different stack for each thread so that the
+application is thread-safe.
 .P
 Strictly speaking, even more is allowed. You can assign the same non-NULL stack
 to a match context that is used by any number of patterns, as long as they are
@ -260,7 +267,7 @@ inefficient solution, and not recommended.
 This is a suggestion for how a multithreaded program that needs to set up
 non-default JIT stacks might operate:
 .sp
-  During thread initalization
+  During thread initialization
    thread_local_var = pcre2_jit_stack_create(...)
 .sp
  During thread exit
@ -309,12 +316,12 @@ stack through the JIT callback function.
 You can free a JIT stack at any time, as long as it will not be used by
 \fBpcre2_match()\fP again. When you assign the stack to a match context, only a
 pointer is set. There is no reference counting or any other magic. You can free
-compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo
-not\fP call \fBpcre2_match()\fP with a match context pointing to an already
-freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently
-used by \fBpcre2_match()\fP in another thread). You can also replace the stack
-in a context at any time when it is not in use. You should free the previous
-stack before assigning a replacement.
+compiled patterns, contexts, and stacks in any order, anytime.
+Just \fIdo not\fP call \fBpcre2_match()\fP with a match context pointing to an
+already freed stack, as that will cause SEGFAULT. (Also, do not free a stack
+currently used by \fBpcre2_match()\fP in another thread). You can also replace
+the stack in a context at any time when it is not in use. You should free the
+previous stack before assigning a replacement.
 .P
 (5) Should I allocate/free a stack every time before/after calling
 \fBpcre2_match()\fP?
@ -348,8 +355,8 @@ out this complicated API.
 .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
 .fi
 .P
-The JIT executable allocator does not free all memory when it is possible.
-It expects new allocations, and keeps some free memory around to improve
+The JIT executable allocator does not free all memory when it is possible. It
+expects new allocations, and keeps some free memory around to improve
 allocation speed. However, in low memory conditions, it might be better to free
 all possible memory. You can cause this to happen by calling
 pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
@ -409,10 +416,10 @@ that was not compiled.
 .P
 When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
 number of other sanity checks are performed on the arguments. For example, if
-the subject pointer is NULL, an immediate error is given. Also, unless
-PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
-interests of speed, these checks do not happen on the JIT fast path, and if
-invalid data is passed, the result is undefined.
+the subject pointer is NULL but the length is non-zero, an immediate error is
+given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
+for validity. In the interests of speed, these checks do not happen on the JIT
+fast path, and if invalid data is passed, the result is undefined.
 .P
 Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
 speedups of more than 10%.
@ -438,6 +445,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 06 March 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 30 November 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2limits.3
+++ b/doc/pcre2limits.3
@ -1,4 +1,4 @@
-.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "SIZE AND OTHER LIMITATIONS"
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
 .P
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
+.P
+The maximum amount of heap memory used for matching is controlled by the heap 
+limit, which can be set in a pattern or in a match context. The default is a 
+very large number, effectively unlimited.
 .
 .
 .SH AUTHOR
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -67,6 +71,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 02 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 26 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2matching.3
+++ b/doc/pcre2matching.3
@ -1,4 +1,4 @@
-.TH PCRE2MATCHING 3 "10 October 2018" "PCRE2 10.33"
+.TH PCRE2MATCHING 3 "28 August 2021" "PCRE2 10.38"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 MATCHING ALGORITHMS"
@ -61,8 +61,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
 If a leaf node is reached, a matching string has been found, and at that point
 the algorithm stops. Thus, if there is more than one possible match, this
 algorithm returns the first one that it finds. Whether this is the shortest,
-the longest, or some intermediate length depends on the way the greedy and
-ungreedy repetition quantifiers are specified in the pattern.
+the longest, or some intermediate length depends on the way the alternations
+and the greedy or ungreedy repetition quantifiers are specified in the
+pattern.
 .P
 Because it ends up with a single path through the tree, it is relatively
 straightforward for this algorithm to keep track of the substrings that are
@ -91,10 +92,15 @@ no more unterminated paths. At this point, terminated paths represent the
 different matching possibilities (if there are none, the match has failed).
 Thus, if there is more than one possible match, this algorithm finds all of
 them, and in particular, it finds the longest. The matches are returned in
-decreasing order of length. There is an option to stop the algorithm after the
-first match (which is necessarily the shortest) is found.
+the output vector in decreasing order of length. There is an option to stop the
+algorithm after the first match (which is necessarily the shortest) is found.
 .P
-Note that all the matches that are found start at the same point in the
+Note that the size of vector needed to contain all the results depends on the
+number of simultaneous matches, not on the number of parentheses in the
+pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
+data block is therefore not advisable when doing DFA matching.
+.P
+Note also that all the matches that are found start at the same point in the
 subject. If the pattern
 .sp
  cat(er(pillar)?)?
@ -157,24 +163,21 @@ code unit) at a time, for all active paths through the tree.
 .P
 9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
 supported. (*FAIL) is supported, and behaves like a failing negative assertion.
+.P
+10. The PCRE2_MATCH_INVALID_UTF option for \fBpcre2_compile()\fP is not
+supported by \fBpcre2_dfa_match()\fP.
 .
 .
 .SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
 .rs
 .sp
-Using the alternative matching algorithm provides the following advantages:
+The main advantage of the alternative algorithm is that all possible matches
+(at a single point in the subject) are automatically found, and in particular,
+the longest match is found. To find more than one match at the same point using
+the standard algorithm, you have to do kludgy things with callouts.
 .P
-1. All possible matches (at a single point in the subject) are automatically
-found, and in particular, the longest match is found. To find more than one
-match using the standard algorithm, you have to do kludgy things with
-callouts.
-.P
-2. Because the alternative algorithm scans the subject string just once, and
-never needs to backtrack (except for lookbehinds), it is possible to pass very
-long subject strings to the matching function in several pieces, checking for
-partial matching each time. Although it is also possible to do multi-segment
-matching using the standard algorithm, by retaining partially matched
-substrings, it is more complicated. The
+Partial matching is possible with this algorithm, though it has some
+limitations. The
 .\" HREF
 \fBpcre2partial\fP
 .\"
@ -191,10 +194,13 @@ The alternative algorithm suffers from a number of disadvantages:
 because it has to search for all possible matches, but is also because it is
 less susceptible to optimization.
 .P
-2. Capturing parentheses, backreferences, and script runs are not supported.
+2. Capturing parentheses, backreferences, script runs, and matching within
+invalid UTF string are not supported.
 .P
 3. Although atomic groups are supported, their use does not provide the
 performance advantage that it does for the standard algorithm.
+.P
+4. JIT optimization is not supported.
 .
 .
 .SH AUTHOR
@ -202,7 +208,7 @@ performance advantage that it does for the standard algorithm.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -211,6 +217,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 10 October 2018
-Copyright (c) 1997-2018 University of Cambridge.
+Last updated: 28 August 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2partial.3
+++ b/doc/pcre2partial.3
@ -1,67 +1,107 @@
-.TH PCRE2PARTIAL 3 "22 December 2014" "PCRE2 10.00"
+.TH PCRE2PARTIAL 3 "04 September 2019" "PCRE2 10.34"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions
 .SH "PARTIAL MATCHING IN PCRE2"
 .rs
 .sp
-In normal use of PCRE2, if the subject string that is passed to a matching
-function matches as far as it goes, but is too short to match the entire
-pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
-might be helpful to distinguish this case from other cases in which there is no
-match.
+In normal use of PCRE2, if there is a match up to the end of a subject string,
+but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH
+is returned, just like any other failing match. There are circumstances where
+it might be helpful to distinguish this "partial match" case.
 .P
-Consider, for example, an application where a human is required to type in data
-for a field with specific formatting requirements. An example might be a date
-in the form \fIddmmmyy\fP, defined by this pattern:
-.sp
-  ^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
-.sp
-If the application sees the user's keystrokes one by one, and can check that
-what has been typed so far is potentially valid, it is able to raise an error
-as soon as a mistake is made, by beeping and not reflecting the character that
-has been typed, for example. This immediate feedback is likely to be a better
-user interface than a check that is delayed until the entire string has been
-entered. Partial matching can also be useful when the subject string is very
-long and is not all available at once.
+One example is an application where the subject string is very long, and not
+all available at once. The requirement here is to be able to do the matching
+segment by segment, but special action is needed when a matched substring spans
+the boundary between two segments.
 .P
-PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
-PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
-The difference between the two options is whether or not a partial match is
-preferred to an alternative complete match, though the details differ between
-the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
-takes precedence.
+Another example is checking a user input string as it is typed, to ensure that
+it conforms to a required format. Invalid characters can be immediately
+diagnosed and rejected, giving instant feedback.
 .P
-If you want to use partial matching with just-in-time optimized code, you must
-call \fBpcre2_jit_compile()\fP with one or both of these options:
+Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is
+requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT
+options when calling a matching function. The difference between the two
+options is whether or not a partial match is preferred to an alternative
+complete match, though the details differ between the two types of matching
+function. If both options are set, PCRE2_PARTIAL_HARD takes precedence.
+.P
+If you want to use partial matching with just-in-time optimized code, as well
+as setting a partial match option for the matching function, you must also call
+\fBpcre2_jit_compile()\fP with one or both of these options:
 .sp
-  PCRE2_JIT_PARTIAL_SOFT
  PCRE2_JIT_PARTIAL_HARD
+  PCRE2_JIT_PARTIAL_SOFT
 .sp
 PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
-matches on the same pattern. If the appropriate JIT mode has not been compiled,
-interpretive matching code is used.
+matches on the same pattern. Separate code is compiled for each mode. If the
+appropriate JIT mode has not been compiled, interpretive matching code is used.
 .P
 Setting a partial matching option disables two of PCRE2's standard
-optimizations. PCRE2 remembers the last literal code unit in a pattern, and
-abandons matching immediately if it is not present in the subject string. This
-optimization cannot be used for a subject string that might match only
-partially. PCRE2 also knows the minimum length of a matching string, and does
+optimization hints. PCRE2 remembers the last literal code unit in a pattern,
+and abandons matching immediately if it is not present in the subject string.
+This optimization cannot be used for a subject string that might match only
+partially. PCRE2 also remembers a minimum length of a matching string, and does
 not bother to run the matching function on shorter strings. This optimization
 is also disabled for partial matching.
 .
 .
+.SH "REQUIREMENTS FOR A PARTIAL MATCH"
+.rs
+.sp
+A possible partial match occurs during matching when the end of the subject
+string is reached successfully, but either more characters are needed to
+complete the match, or the addition of more characters might change what is
+matched.
+.P
+Example 1: if the pattern is /abc/ and the subject is "ab", more characters are
+definitely needed to complete a match. In this case both hard and soft matching
+options yield a partial match.
+.P
+Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match
+can be found, but the addition of more characters might change what is
+matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match;
+PCRE2_PARTIAL_SOFT returns the complete match.
+.P
+On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next
+pattern item is \ez, \eZ, \eb, \eB, or $ there is always a partial match.
+Otherwise, for both options, the next pattern item must be one that inspects a
+character, and at least one of the following must be true:
+.P
+(1) At least one character has already been inspected. An inspected character
+need not form part of the final matched string; lookbehind assertions and the
+\eK escape sequence provide ways of inspecting characters before the start of a
+matched string.
+.P
+(2) The pattern contains one or more lookbehind assertions. This condition
+exists in case there is a lookbehind that inspects characters before the start
+of the match.
+.P
+(3) There is a special case when the whole pattern can match an empty string.
+When the starting point is at the end of the subject, the empty string match is
+a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above
+conditions is true, it is returned. However, because adding more characters
+might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match,
+which in this case means "there is going to be a match at this point, but until
+some more characters are added, we do not know if it will be an empty string or
+something longer".
+.
+.
+.
 .SH "PARTIAL MATCHING USING pcre2_match()"
 .rs
 .sp
-A partial match occurs during a call to \fBpcre2_match()\fP when the end of the
-subject string is reached successfully, but matching cannot continue because
-more characters are needed. However, at least one character in the subject must
-have been inspected. This character need not form part of the final matched
-string; lookbehind assertions and the \eK escape sequence provide ways of
-inspecting characters before the start of a matched string. The requirement for
-inspecting at least one character exists because an empty string can always be
-matched; without such a restriction there would always be a partial match of an
-empty string at the end of the subject.
+When a partial matching option is set, the result of calling
+\fBpcre2_match()\fP can be one of the following:
+.TP 2
+\fBA successful match\fP
+A complete match has been found, starting and ending within this subject.
+.TP
+\fBPCRE2_ERROR_NOMATCH\fP
+No match can start anywhere in this subject.
+.TP
+\fBPCRE2_ERROR_PARTIAL\fP
+Adding more characters may result in a complete match that uses one or more
+characters from the end of this subject.
 .P
 When a partial match is returned, the first two elements in the ovector point
 to the portion of the subject that was matched, but the values in the rest of
@ -77,53 +117,40 @@ is "456abc12", a partial match is found for the string "abc12", because all
 these characters are needed for a subsequent re-match with additional
 characters.
 .P
-What happens when a partial match is identified depends on which of the two
-partial matching options are set.
-.
-.
-.SS "PCRE2_PARTIAL_SOFT WITH pcre2_match()"
-.rs
-.sp
-If PCRE2_PARTIAL_SOFT is set when \fBpcre2_match()\fP identifies a partial
-match, the partial match is remembered, but matching continues as normal, and
-other alternatives in the pattern are tried. If no complete match can be found,
-PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
-.P
-This option is "soft" because it prefers a complete match over a partial match.
-All the various matching items in a pattern behave as if the subject string is
-potentially complete. For example, \ez, \eZ, and $ match at the end of the
-subject, as normal, and for \eb and \eB the end of the subject is treated as a
-non-alphanumeric.
-.P
 If there is more than one partial match, the first one that was found provides
 the data that is returned. Consider this pattern:
 .sp
  /123\ew+X|dogY/
 .sp
-If this is matched against the subject string "abc123dog", both
-alternatives fail to match, but the end of the subject is reached during
-matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
-identifying "123dog" as the first partial match that was found. (In this
-example, there are two partial matches, because "dog" on its own partially
-matches the second alternative.)
+If this is matched against the subject string "abc123dog", both alternatives
+fail to match, but the end of the subject is reached during matching, so
+PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying
+"123dog" as the first partial match. (In this example, there are two partial
+matches, because "dog" on its own partially matches the second alternative.)
 .
 .
-.SS "PCRE2_PARTIAL_HARD WITH pcre2_match()"
-.rs
-.sp
-If PCRE2_PARTIAL_HARD is set for \fBpcre2_match()\fP, PCRE2_ERROR_PARTIAL is
-returned as soon as a partial match is found, without continuing to search for
-possible complete matches. This option is "hard" because it prefers an earlier
-partial match over a later complete match. For this reason, the assumption is
-made that the end of the supplied subject string may not be the true end of the
-available data, and so, if \ez, \eZ, \eb, \eB, or $ are encountered at the end
-of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
-character in the subject has been inspected.
-.
-.
-.SS "Comparing hard and soft partial matching"
+.SS "How a partial match is processed by pcre2_match()"
 .rs
 .sp
+What happens when a partial match is identified depends on which of the two
+partial matching options is set.
+.P
+If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a
+partial match is found, without continuing to search for possible complete
+matches. This option is "hard" because it prefers an earlier partial match over
+a later complete match. For this reason, the assumption is made that the end of
+the supplied subject string is not the true end of the available data, which is
+why \ez, \eZ, \eb, \eB, and $ always give a partial match.
+.P
+If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching
+continues as normal, and other alternatives in the pattern are tried. If no
+complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of
+PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match
+over a partial match. All the various matching items in a pattern behave as if
+the subject string is potentially complete; \ez, \eZ, and $ match at the end of
+the subject, as normal, and for \eb and \eB the end of the subject is treated
+as a non-alphanumeric.
+.P
 The difference between the two partial matching options can be illustrated by a
 pattern such as:
 .sp
@ -148,25 +175,132 @@ The second pattern will never match "dogsbody", because it will always find the
 shorter match first.
 .
 .
+.SS "Example of partial matching using pcre2test"
+.rs
+.sp
+The \fBpcre2test\fP data modifiers \fBpartial_hard\fP (or \fBph\fP) and
+\fBpartial_soft\fP (or \fBps\fP) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT,
+respectively, when calling \fBpcre2_match()\fP. Here is a run of
+\fBpcre2test\fP using a pattern that matches the whole subject in the form of a
+date:
+.sp
+    re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
+  data> 25dec3\e=ph
+  Partial match: 23dec3
+  data> 3ju\e=ph
+  Partial match: 3ju
+  data> 3juj\e=ph
+  No match
+.sp
+This example gives the same results for both hard and soft partial matching
+options. Here is an example where there is a difference:
+.sp
+    re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
+  data> 25jun04\e=ps
+   0: 25jun04
+   1: jun
+  data> 25jun04\e=ph
+  Partial match: 25jun04
+.sp
+With PCRE2_PARTIAL_SOFT, the subject is matched completely. For
+PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so
+there is only a partial match.
+.
+.
+.
+.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
+.rs
+.sp
+PCRE was not originally designed with multi-segment matching in mind. However,
+over time, features (including partial matching) that make multi-segment
+matching possible have been added. A very long string can be searched segment
+by segment by calling \fBpcre2_match()\fP repeatedly, with the aim of achieving
+the same results that would happen if the entire string was available for
+searching all the time. Normally, the strings that are being sought are much
+shorter than each individual segment, and are in the middle of very long
+strings, so the pattern is normally not anchored.
+.P
+Special logic must be implemented to handle a matched substring that spans a
+segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a
+partial match at the end of a segment whenever there is the possibility of
+changing the match by adding more characters. The PCRE2_NOTBOL option should
+also be set for all but the first segment.
+.P
+When a partial match occurs, the next segment must be added to the current
+subject and the match re-run, using the \fIstartoffset\fP argument of
+\fBpcre2_match()\fP to begin at the point where the partial match started.
+For example:
+.sp
+    re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
+  data> ...the date is 23ja\e=ph
+  Partial match: 23ja
+  data> ...the date is 23jan19 and on that day...\e=offset=15
+   0: 23jan19
+   1: jan
+.sp
+Note the use of the \fBoffset\fP modifier to start the new match where the
+partial match was found. In this example, the next segment was added to the one
+in which the partial match was found. This is the most straightforward
+approach, typically using a memory buffer that is twice the size of each
+segment. After a partial match, the first half of the buffer is discarded, the
+second half is moved to the start of the buffer, and a new segment is added
+before repeating the match as in the example above. After a no match, the
+entire buffer can be discarded.
+.P
+If there are memory constraints, you may want to discard text that precedes a
+partial match before adding the next segment. Unfortunately, this is not at
+present straightforward. In cases such as the above, where the pattern does not
+contain any lookbehinds, it is sufficient to retain only the partially matched
+substring. However, if the pattern contains a lookbehind assertion, characters
+that precede the start of the partial match may have been inspected during the
+matching process. When \fBpcre2test\fP displays a partial match, it indicates
+these characters with '<' if the \fBallusedtext\fP modifier is set:
+.sp
+    re> "(?<=123)abc"
+  data> xx123ab\e=ph,allusedtext
+  Partial match: 123ab
+                 <<<
+.sp
+However, the \fBallusedtext\fP modifier is not available for JIT matching,
+because JIT matching does not record the first (or last) consulted characters.
+For this reason, this information is not available via the API. It is therefore
+not possible in general to obtain the exact number of characters that must be
+retained in order to get the right match result. If you cannot retain the
+entire segment, you must find some heuristic way of choosing.
+.P
+If you know the approximate length of the matching substrings, you can use that
+to decide how much text to retain. The only lookbehind information that is
+currently available via the API is the length of the longest individual
+lookbehind in a pattern, but this can be misleading if there are nested
+lookbehinds. The value returned by calling \fBpcre2_pattern_info()\fP with the
+PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code
+units) that any individual lookbehind moves back when it is processed. A
+pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but
+inspects two characters before its starting point.
+.P
+In a non-UTF or a 32-bit case, moving back is just a subtraction, but in
+UTF-8 or UTF-16 you have to count characters while moving back through the code
+units.
+.
+.
 .SH "PARTIAL MATCHING USING pcre2_dfa_match()"
 .rs
 .sp
-The DFA functions move along the subject string character by character, without
+The DFA function moves along the subject string character by character, without
 backtracking, searching for all possible matches simultaneously. If the end of
 the subject is reached before the end of the pattern, there is the possibility
-of a partial match, again provided that at least one character has been
-inspected.
+of a partial match.
 .P
 When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
 have been no complete matches. Otherwise, the complete matches are returned.
-However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
-any complete matches. The portion of the string that was matched when the
-longest partial match was found is set as the first matching string.
+If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any
+complete matches. The portion of the string that was matched when the longest
+partial match was found is set as the first matching string.
 .P
-Because the DFA functions always search for all possible matches, and there is
-no difference between greedy and ungreedy repetition, their behaviour is
-different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
-the string "dog" matched against the ungreedy pattern shown above:
+Because the DFA function always searches for all possible matches, and there is
+no difference between greedy and ungreedy repetition, its behaviour is
+different from the \fBpcre2_match()\fP. Consider the string "dog" matched
+against this ungreedy pattern:
 .sp
  /dog(sbody)??/
 .sp
@ -175,62 +309,17 @@ Whereas the standard function stops as soon as it finds the complete match for
 returns that when PCRE2_PARTIAL_HARD is set.
 .
 .
-.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
-.rs
-.sp
-If a pattern ends with one of sequences \eb or \eB, which test for word
-boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
-results. Consider this pattern:
-.sp
-  /\ebcat\eb/
-.sp
-This matches "cat", provided there is a word boundary at either end. If the
-subject string is "the cat", the comparison of the final "t" with a following
-character cannot take place, so a partial match is found. However, normal
-matching carries on, and \eb matches at the end of the subject when the last
-character is a letter, so a complete match is found. The result, therefore, is
-\fInot\fP PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
-PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
-.
-.
-.SH "EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST"
-.rs
-.sp
-If the \fBpartial_soft\fP (or \fBps\fP) modifier is present on a
-\fBpcre2test\fP data line, the PCRE2_PARTIAL_SOFT option is used for the match.
-Here is a run of \fBpcre2test\fP that uses the date example quoted above:
-.sp
-    re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
-  data> 25jun04\e=ps
-   0: 25jun04
-   1: jun
-  data> 25dec3\e=ps
-  Partial match: 23dec3
-  data> 3ju\e=ps
-  Partial match: 3ju
-  data> 3juj\e=ps
-  No match
-  data> j\e=ps
-  No match
-.sp
-The first data string is matched completely, so \fBpcre2test\fP shows the
-matched substrings. The remaining four strings do not match the complete
-pattern, but the first two are partial matches. Similar output is obtained
-if DFA matching is used.
-.P
-If the \fBpartial_hard\fP (or \fBph\fP) modifier is present on a
-\fBpcre2test\fP data line, the PCRE2_PARTIAL_HARD option is set for the match.
-.
-.
 .SH "MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()"
 .rs
 .sp
-When a partial match has been found using a DFA matching function, it is
+When a partial match has been found using the DFA matching function, it is
 possible to continue the match by providing additional subject data and calling
 the function again with the same compiled regular expression, this time setting
 the PCRE2_DFA_RESTART option. You must pass the same working space as before,
-because this is where details of the previous partial match are stored. Here is
-an example using \fBpcre2test\fP:
+because this is where details of the previous partial match are stored. You can
+set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART
+to continue partial matching over multiple segments. Here is an example using
+\fBpcre2test\fP:
 .sp
    re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
  data> 23ja\e=dfa,ps
@ -242,147 +331,10 @@ The first call has "23ja" as the subject, and requests partial matching; the
 second call has "n05" as the subject for the continued (restarted) match.
 Notice that when the match is complete, only the last part is shown; PCRE2 does
 not retain the previously partially-matched string. It is up to the calling
-program to do that if it needs to.
-.P
-That means that, for an unanchored pattern, if a continued match fails, it is
-not possible to try again at a new starting point. All this facility is capable
-of doing is continuing with the previous match attempt. In the previous
-example, if the second set of data is "ug23" the result is no match, even
-though there would be a match for "aug23" if the entire string were given at
-once. Depending on the application, this may or may not be what you want.
-The only way to allow for starting again at the next character is to retain the
-matched part of the subject and try a new complete match.
-.P
-You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
-PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
-facility can be used to pass very long subject strings to the DFA matching
-functions.
-.
-.
-.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
-.rs
-.sp
-Unlike the DFA function, it is not possible to restart the previous match with
-a new segment of data when using \fBpcre2_match()\fP. Instead, new data must be
-added to the previous subject string, and the entire match re-run, starting
-from the point where the partial match occurred. Earlier data can be discarded.
-.P
-It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
-treat the end of a segment as the end of the subject when matching \ez, \eZ,
-\eb, \eB, and $. Consider an unanchored pattern that matches dates:
-.sp
-    re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
-  data> The date is 23ja\e=ph
-  Partial match: 23ja
-.sp
-At this stage, an application could discard the text preceding "23ja", add on
-text from the next segment, and call the matching function again. Unlike the
-DFA matching function, the entire matching string must always be available,
-and the complete matching process occurs for each call, so more memory and more
-processing time is needed.
-.
-.
-.SH "ISSUES WITH MULTI-SEGMENT MATCHING"
-.rs
-.sp
-Certain types of pattern may give problems with multi-segment matching,
-whichever matching function is used.
-.P
-1. If the pattern contains a test for the beginning of a line, you need to pass
-the PCRE2_NOTBOL option when the subject string for any call does start at the
-beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
-doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
-includes the effect of PCRE2_NOTEOL.
-.P
-2. If a pattern contains a lookbehind assertion, characters that precede the
-start of the partial match may have been inspected during the matching process.
-When using \fBpcre2_match()\fP, sufficient characters must be retained for the
-next match attempt. You can ensure that enough characters are retained by doing
-the following:
-.P
-Before doing any matching, find the length of the longest lookbehind in the
-pattern by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_MAXLOOKBEHIND
-option. Note that the resulting count is in characters, not code units. After a
-partial match, moving back from the ovector[0] offset in the subject by the
-number of characters given for the maximum lookbehind gets you to the earliest
-character that must be retained. In a non-UTF or a 32-bit situation, moving
-back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
-while moving back through the code units.
-.P
-Characters before the point you have now reached can be discarded, and after
-the next segment has been added to what is retained, you should run the next
-match with the \fBstartoffset\fP argument set so that the match begins at the
-same point as before.
-.P
-For example, if the pattern "(?<=123)abc" is partially matched against the
-string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
-lookbehind count is 3, so all characters before offset 2 can be discarded. The
-value of \fBstartoffset\fP for the next match should be 3. When \fBpcre2test\fP
-displays a partial match, it indicates the lookbehind characters with '<'
-characters:
-.sp
-    re> "(?<=123)abc"
-  data> xx123ab\e=ph
-  Partial match: 123ab
-                 <<<
-.P
-3. Because a partial match must always contain at least one character, what
-might be considered a partial match of an empty string actually gives a "no
-match" result. For example:
-.sp
-    re> /c(?<=abc)x/
-  data> ab\e=ps
-  No match
-.sp
-If the next segment begins "cx", a match should be found, but this will only
-happen if characters from the previous segment are retained. For this reason, a
-"no match" result should be interpreted as "partial match of an empty string"
-when the pattern contains lookbehinds.
-.P
-4. Matching a subject string that is split into multiple segments may not
-always produce exactly the same result as matching over one single long string,
-especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
-Word Boundaries" above describes an issue that arises if the pattern ends with
-\eb or \eB. Another kind of difference may occur when there are multiple
-matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
-is given only when there are no completed matches. This means that as soon as
-the shortest match has been found, continuation to a new subject segment is no
-longer possible. Consider this \fBpcre2test\fP example:
-.sp
-    re> /dog(sbody)?/
-  data> dogsb\e=ps
-   0: dog
-  data> do\e=ps,dfa
-  Partial match: do
-  data> gsb\e=ps,dfa,dfa_restart
-   0: g
-  data> dogsbody\e=dfa
-   0: dogsbody
-   1: dog
-.sp
-The first data line passes the string "dogsb" to a standard matching function,
-setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
-for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
-string "dog" is a complete match. Similarly, when the subject is presented to
-a DFA matching function in several parts ("do" and "gsb" being the first two)
-the match stops when "dog" has been found, and it is not possible to continue.
-On the other hand, if "dogsbody" is presented as a single string, a DFA
-matching function finds both matches.
-.P
-Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
-multi-segment data. The example above then behaves differently:
-.sp
-    re> /dog(sbody)?/
-  data> dogsb\e=ph
-  Partial match: dogsb
-  data> do\e=ps,dfa
-  Partial match: do
-  data> gsb\e=ph,dfa,dfa_restart
-  Partial match: gsb
-.sp
-5. Patterns that contain alternatives at the top level which do not all start
-with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
-used. For example, consider this pattern:
+program to do that if it needs to. This means that, for an unanchored pattern,
+if a continued match fails, it is not possible to try again at a new starting
+point. All this facility is capable of doing is continuing with the previous
+match attempt. For example, consider this pattern:
 .sp
  1234|3789
 .sp
@ -391,28 +343,15 @@ alternative is found at offset 3. There is no partial match for the second
 alternative, because such a match does not start at the same point in the
 subject string. Attempting to continue with the string "7890" does not yield a
 match because only those alternatives that match at one point in the subject
-are remembered. The problem arises because the start of the second alternative
-matches within the first alternative. There is no problem with anchored
-patterns or patterns such as:
-.sp
-  1234|ABCD
-.sp
-where no string can be a partial match for both alternatives. This is not a
-problem if a standard matching function is used, because the entire match has
-to be rerun each time:
-.sp
-    re> /1234|3789/
-  data> ABC123\e=ph
-  Partial match: 123
-  data> 1237890
-   0: 3789
-.sp
-Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
-the entire match can also be used with the DFA matching function. Another
-possibility is to work with two buffers. If a partial match at offset \fIn\fP
-in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
-the second buffer, you can then try a new match starting at offset \fIn+1\fP in
-the first buffer.
+are remembered. Depending on the application, this may or may not be what you
+want.
+.P
+If you do want to allow for starting again at the next character, one way of
+doing it is to retain some or all of the segment and try a new complete match,
+as described for \fBpcre2_match()\fP above. Another possibility is to work with
+two buffers. If a partial match at offset \fIn\fP in the first buffer is
+followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you
+can then try a new match starting at offset \fIn+1\fP in the first buffer.
 .
 .
 .SH AUTHOR
@ -429,6 +368,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 22 December 2014
-Copyright (c) 1997-2014 University of Cambridge.
+Last updated: 04 September 2019
+Copyright (c) 1997-2019 University of Cambridge.
 .fi
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "12 February 2019" "PCRE2 10.33"
+.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -52,10 +52,11 @@ single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
 specified for the 32-bit library, in which case it constrains the character
 values to valid Unicode code points. To process UTF strings, PCRE2 must be
 built to include Unicode support (which is the default). When using UTF strings
-you must either call the compiling function with the PCRE2_UTF option, or the
-pattern must start with the special sequence (*UTF), which is equivalent to
-setting the relevant option. How setting a UTF mode affects pattern matching is
-mentioned in several places below. There is also a summary of features in the
+you must either call the compiling function with one or both of the PCRE2_UTF
+or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special
+sequence (*UTF), which is equivalent to setting the relevant PCRE2_UTF. How
+setting a UTF mode affects pattern matching is mentioned in several places
+below. There is also a summary of features in the
 .\" HREF
 \fBpcre2unicode\fP
 .\"
@ -74,7 +75,8 @@ Another special sequence that may appear at the start of a pattern is (*UCP).
 This has the same effect as setting the PCRE2_UCP option: it causes sequences
 such as \ed and \ew to use Unicode properties to determine character types,
 instead of recognizing only characters with codes less than 256 via a lookup
-table.
+table. If also causes upper/lower casing operations to use Unicode properties
+for characters with code points greater than 127, even when UTF is not set.
 .P
 Some applications that allow their users to supply patterns may wish to
 restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to
@ -261,8 +263,11 @@ corresponding characters in the subject. As a trivial example, the pattern
  The quick brown fox
 .sp
 matches a portion of a subject string that is identical to itself. When
-caseless matching is specified (the PCRE2_CASELESS option), letters are matched
-independently of case.
+caseless matching is specified (the PCRE2_CASELESS option or (?i) within the
+pattern), letters are matched independently of case. Note that there are two
+ASCII characters, K and S, that, in addition to their lower case ASCII
+equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
+(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set.
 .P
 The power of regular expressions comes from the ability to include wild cards,
 character classes, alternatives, and repetitions in the pattern. These are
@ -296,6 +301,22 @@ a character class the only metacharacters are:
  [      POSIX character class (if followed by POSIX syntax)
  ]      terminates the character class
 .sp
+If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
+the pattern, other than in a character class, and characters between a #
+outside a character class and the next newline, inclusive, are ignored. An
+escaping backslash can be used to include a white space or a # character as
+part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same
+applies, but in addition unescaped space and horizontal tab characters are
+ignored inside a character class. Note: only these two characters are ignored,
+not the full set of pattern white space characters that are ignored outside a
+character class. Option settings can be changed within a pattern; see the
+section entitled
+.\" HTML <a href="#internaloptions">
+.\" </a>
+"Internal Option Setting"
+.\"
+below.
+.P
 The following sections describe the use of each of the metacharacters.
 .
 .
@ -313,15 +334,9 @@ would otherwise be interpreted as a metacharacter, so it is always safe to
 precede a non-alphanumeric with backslash to specify that it stands for itself.
 In particular, if you want to match a backslash, you write \e\e.
 .P
-In a UTF mode, only ASCII digits and letters have any special meaning after a
-backslash. All other characters (in particular, those whose code points are
-greater than 127) are treated as literals.
-.P
-If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
-the pattern (other than in a character class), and characters between a #
-outside a character class and the next newline, inclusive, are ignored. An
-escaping backslash can be used to include a white space or # character as part
-of the pattern.
+Only ASCII digits and letters have any special meaning after a backslash. All
+other characters (in particular, those whose code points are greater than 127)
+are treated as literals.
 .P
 If you want to treat all characters in a sequence as literals, you can do so by
 putting them between \eQ and \eE. This is different from Perl in that $ and @
@ -398,11 +413,11 @@ PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition,
 There may be any number of hexadecimal digits. This syntax is from ECMAScript
 6.
 .P
-The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
-is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
-\eN{name} to specify characters by Unicode name; PCRE2 does not support this.
-Note that when \eN is not followed by an opening brace (curly bracket) it has
-an entirely different meaning, matching any character that is not a newline.
+The \eN{U+hhh..} escape sequence is recognized only when PCRE2 is operating in
+UTF mode. Perl also uses \eN{name} to specify characters by Unicode name; PCRE2
+does not support this. Note that when \eN is not followed by an opening brace
+(curly bracket) it has an entirely different meaning, matching any character
+that is not a newline.
 .P
 There are some legacy applications where the escape sequence \er is expected to
 match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \er in a
@ -494,7 +509,6 @@ for themselves. For example, outside a character class:
 .\" JOIN
  \e377   might be a backreference, otherwise
            the value 255 (decimal)
-.\" JOIN
  \e81    is always a backreference
 .sp
 Note that octal values of 100 or greater that are specified using this syntax
@ -726,7 +740,7 @@ Unicode support is not needed for these characters to be recognized.
 .P
 It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
 complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
-at compile time. (BSR is an abbrevation for "backslash R".) This can be made
+at compile time. (BSR is an abbreviation for "backslash R".) This can be made
 the default when PCRE2 is built; if this is the case, the other behaviour can
 be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
 these settings by starting a pattern string with one of the following
@ -758,187 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
 sequences are of course limited to testing characters whose code points are
 less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
 greater than 0x10ffff (the Unicode limit) may be encountered. These are all
-treated as being in the Unknown script and with an unassigned type. The extra
-escape sequences are:
+treated as being in the Unknown script and with an unassigned type.
+.P
+Matching characters by Unicode property is not fast, because PCRE2 has to do a
+multistage table lookup in order to find a character's property. That is why
+the traditional escape sequences such as \ed and \ew do not use Unicode
+properties in PCRE2 by default, though you can make them do so by setting the
+PCRE2_UCP option or by starting the pattern with (*UCP).
+.P
+The extra escape sequences that provide property support are:
 .sp
  \ep{\fIxx\fP}   a character with the \fIxx\fP property
  \eP{\fIxx\fP}   a character without the \fIxx\fP property
  \eX       a Unicode extended grapheme cluster
 .sp
-The property names represented by \fIxx\fP above are case-sensitive. There is
-support for Unicode script names, Unicode general category properties, "Any",
-which matches any character (including newline), and some special PCRE2
-properties (described in the
+The property names represented by \fIxx\fP above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
 .\" HTML <a href="#extraprops">
 .\" </a>
-next section).
+below).
 .\"
-Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
-Note that \eP{Any} does not match any characters, so always causes a match
-failure.
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \eP{Any} does not match any characters, so always causes a
+match failure.
+.
+.
+.
+.SS "Script properties for \ep and \eP"
+.rs
+.sp
+There are three different syntax forms for matching a script. Each Unicode
+character has a basic script and, optionally, a list of other scripts ("Script
+Extensions") with which it is commonly used. Using the Adlam script as an
+example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
+\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
+extensions list. The full names "script" and "script extensions" for the
+property types are recognized, and a equals sign is an alternative to the
+colon. If a script name is given without a property type, for example,
+\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
+interpretation at release 5.26 and PCRE2 changed at release 10.40.
 .P
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-.sp
-  \ep{Greek}
-  \eP{Han}
-.sp
 Unassigned characters (and in non-UTF 32-bit mode, characters with code points
 greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
 part of an identified script are lumped together as "Common". The current list
-of scripts is:
-.P
-Adlam,
-Ahom,
-Anatolian_Hieroglyphs,
-Arabic,
-Armenian,
-Avestan,
-Balinese,
-Bamum,
-Bassa_Vah,
-Batak,
-Bengali,
-Bhaiksuki,
-Bopomofo,
-Brahmi,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Caucasian_Albanian,
-Chakma,
-Cham,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Dogra,
-Duployan,
-Egyptian_Hieroglyphs,
-Elbasan,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Grantha,
-Greek,
-Gujarati,
-Gunjala_Gondi,
-Gurmukhi,
-Han,
-Hangul,
-Hanifi_Rohingya,
-Hanunoo,
-Hatran,
-Hebrew,
-Hiragana,
-Imperial_Aramaic,
-Inherited,
-Inscriptional_Pahlavi,
-Inscriptional_Parthian,
-Javanese,
-Kaithi,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khmer,
-Khojki,
-Khudawadi,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_A,
-Linear_B,
-Lisu,
-Lycian,
-Lydian,
-Mahajani,
-Makasar,
-Malayalam,
-Mandaic,
-Manichaean,
-Marchen,
-Masaram_Gondi,
-Medefaidrin,
-Meetei_Mayek,
-Mende_Kikakui,
-Meroitic_Cursive,
-Meroitic_Hieroglyphs,
-Miao,
-Modi,
-Mongolian,
-Mro,
-Multani,
-Myanmar,
-Nabataean,
-New_Tai_Lue,
-Newa,
-Nko,
-Nushu,
-Ogham,
-Ol_Chiki,
-Old_Hungarian,
-Old_Italic,
-Old_North_Arabian,
-Old_Permic,
-Old_Persian,
-Old_Sogdian,
-Old_South_Arabian,
-Old_Turkic,
-Oriya,
-Osage,
-Osmanya,
-Pahawh_Hmong,
-Palmyrene,
-Pau_Cin_Hau,
-Phags_Pa,
-Phoenician,
-Psalter_Pahlavi,
-Rejang,
-Runic,
-Samaritan,
-Saurashtra,
-Sharada,
-Shavian,
-Siddham,
-SignWriting,
-Sinhala,
-Sogdian,
-Sora_Sompeng,
-Soyombo,
-Sundanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tai_Tham,
-Tai_Viet,
-Takri,
-Tamil,
-Tangut,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Tirhuta,
-Ugaritic,
-Unknown,
-Vai,
-Warang_Citi,
-Yi,
-Zanabazar_Square.
-.P
+of recognized script names and their 4-character abbreviations can be obtained
+by running this command:
+.sp
+  pcre2test -LS
+.sp
+.
+.
+.
+.SS "The general category property for \ep and \eP"
+.rs
+.sp
 Each character has exactly one Unicode general category property, specified by
 a two-letter abbreviation. For compatibility with Perl, negation can be
 specified by including a circumflex between the opening brace and the property
@ -998,9 +889,9 @@ The following general category property codes are supported:
  Zp    Paragraph separator
  Zs    Space separator
 .sp
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
+The special property LC, which has the synonym L&, is also supported: it
+matches a character that has the Lu, Ll, or Lt property, in other words, a
+letter that is not classified as a modifier or "other".
 .P
 The Cs (Surrogate) property applies only to characters whose code points are in
 the range U+D800 to U+DFFF. These characters are no different to any other
@ -1024,12 +915,53 @@ Unicode table.
 Specifying caseless matching does not affect these escape sequences. For
 example, \ep{Lu} always matches only upper case letters. This is different from
 the behaviour of current versions of Perl.
-.P
-Matching characters by Unicode property is not fast, because PCRE2 has to do a
-multistage table lookup in order to find a character's property. That is why
-the traditional escape sequences such as \ed and \ew do not use Unicode
-properties in PCRE2 by default, though you can make them do so by setting the
-PCRE2_UCP option or by starting the pattern with (*UCP).
+.
+.
+.SS "Binary (yes/no) properties for \ep and \eP"
+.rs
+.sp
+Unicode defines a number of binary properties, that is, properties whose only
+values are true or false. You can obtain a list of those that are recognized by
+\ep and \eP, along with their abbreviations, by running this command:
+.sp
+  pcre2test -LP
+.sp
+.
+.
+.SS "The Bidi_Class property for \ep and \eP"
+.rs
+.sp
+  \ep{Bidi_Class:<class>}   matches a character with the given class
+  \ep{BC:<class>}           matches a character with the given class
+.sp
+The recognized classes are:
+.sp
+  AL          Arabic letter
+  AN          Arabic number
+  B           paragraph separator
+  BN          boundary neutral
+  CS          common separator
+  EN          European number
+  ES          European separator
+  ET          European terminator
+  FSI         first strong isolate
+  L           left-to-right
+  LRE         left-to-right embedding
+  LRI         left-to-right isolate
+  LRO         left-to-right override
+  NSM         non-spacing mark
+  ON          other neutral
+  PDF         pop directional format
+  PDI         pop directional isolate
+  R           right-to-left
+  RLE         right-to-left embedding
+  RLI         right-to-left isolate
+  RLO         right-to-left override
+  S           segment separator
+  WS          which space
+.sp
+An equals sign may be used instead of a colon. The class names are
+case-insensitive; only the short names listed above are recognized.
 .
 .
 .SS Extended grapheme clusters
@ -1059,7 +991,7 @@ additional characters according to the following rules for ending a cluster:
 3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
 are of five types: L, V, T, LV, and LVT. An L character may be followed by an
 L, V, LV, or LVT character; an LV or V character may be followed by a V or T
-character; an LVT or T character may be follwed only by a T character.
+character; an LVT or T character may be followed only by a T character.
 .P
 4. Do not end before extending characters or spacing marks or the "zero-width
 joiner" character. Characters with the "mark" property always have the
@ -1145,8 +1077,11 @@ For example, when the pattern
 .sp
 matches "foobar", the first substring is still set to "foo".
 .P
-Perl documents that the use of \eK within assertions is "not well defined". In
-PCRE2, \eK is acted upon when it occurs inside positive assertions, but is
+From version 5.32.0 Perl forbids the use of \eK in lookaround assertions. From
+release 10.38 PCRE2 also forbids this by default. However, the
+PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
+\fBpcre2_compile()\fP to re-enable the previous behaviour. When this option is
+set, \eK is acted upon when it occurs inside positive assertions, but is
 ignored in negative assertions. Note that when a pattern such as (?=ab\eK)
 matches, the reported start of the match can be greater than the end of the
 match. Using \eK in a lookbehind assertion at the start of a pattern can also
@ -1305,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
 .sp
 Outside a character class, a dot in the pattern matches any one character in
 the subject string except (by default) a character that signifies the end of a
-line.
+line. One or more characters may be specified as line terminators (see
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above).
 .P
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
+Dot never matches a single line-ending character. When the two-character
+sequence CRLF is the only line ending, dot does not match CR if it is
+immediately followed by LF, but otherwise it matches all characters (including
+isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurences
+of CR of LF match dot. When all Unicode line endings are being recognized, dot
+does not match CR or LF or any of the other line ending characters.
 .P
 The behaviour of dot with regard to newlines can be changed. If the
 PCRE2_DOTALL option is set, a dot matches any one character, without exception.
@ -1352,7 +1292,7 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
 with a malformed UTF character. This has undefined results, because PCRE2
 assumes that it is matching character by character in a valid UTF string (by
 default it checks the subject string's validity at the start of processing
-unless the PCRE2_NO_UTF_CHECK option is used).
+unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used).
 .P
 An application can lock out the use of \eC by setting the
 PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
@ -1426,7 +1366,10 @@ Characters in a class may be specified by their code points using \eo, \ex, or
 \eN{U+hh..} in the usual way. When caseless matching is set, any letters in a
 class represent both their upper case and lower case versions, so for example,
 a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
-match "A", whereas a caseful version would.
+match "A", whereas a caseful version would. Note that there are two ASCII
+characters, K and S, that, in addition to their lower case ASCII equivalents,
+are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S)
+respectively when either PCRE2_UTF or PCRE2_UCP is set.
 .P
 Characters that might indicate line breaks are never treated in any special way
 when matching character classes, whatever line-ending sequence is in use, and
@ -1638,6 +1581,7 @@ that succeeds is used. If the alternatives are within a group
 alternative in the group.
 .
 .
+.\" HTML <a name="internaloptions"></a>
 .SH "INTERNAL OPTION SETTING"
 .rs
 .sp
@ -1896,12 +1840,21 @@ are permitted for groups with the same number, for example:
  (?|(?<AA>aa)|(?<AA>bb))
 .sp
 The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES
-option at compile time, or by the use of (?J) within the pattern. Duplicate
-names can be useful for patterns where only one instance of the named capture
-group can match. Suppose you want to match the name of a weekday, either as a
-3-letter abbreviation or as the full name, and in both cases you want to
-extract the abbreviation. This pattern (ignoring the line breaks) does the job:
+option at compile time, or by the use of (?J) within the pattern, as described
+in the section entitled
+.\" HTML <a href="#internaloptions">
+.\" </a>
+"Internal Option Setting"
+.\"
+above.
+.P
+Duplicate names can be useful for patterns where only one instance of the named
+capture group can match. Suppose you want to match the name of a weekday,
+either as a 3-letter abbreviation or as the full name, and in both cases you
+want to extract the abbreviation. This pattern (ignoring the line breaks) does
+the job:
 .sp
+  (?J)
  (?<DN>Mon|Fri|Sun)(?:day)?|
  (?<DN>Tue)(?:sday)?|
  (?<DN>Wed)(?:nesday)?|
@ -1921,7 +1874,7 @@ they appear in the overall pattern. The first one that is set is used for the
 reference. For example, this pattern matches both "foofoo" and "barbar" but not
 "foobar" or "barfoo":
 .sp
-  (?:(?<n>foo)|(?<n>bar))\ek<n>
+  (?J)(?:(?<n>foo)|(?<n>bar))\ek<n>
 .sp
 .P
 If you make a subroutine call to a non-unique named group, the one that
@ -1960,7 +1913,7 @@ items:
  an escape such as \ed or \epL that matches a single character
  a character class
  a backreference
-  a parenthesized group (including most assertions)
+  a parenthesized group (including lookaround assertions)
  a subroutine call (recursive or otherwise)
 .sp
 The general repetition quantifier specifies a minimum and maximum number of
@ -2021,8 +1974,10 @@ no characters with a quantifier that has no upper limit, for example:
 .sp
 Earlier versions of Perl and PCRE1 used to give an error at compile time for
 such patterns. However, because there are cases where this can be useful, such
-patterns are now accepted, but if any repetition of the group does in fact
-match no characters, the loop is forcibly broken.
+patterns are now accepted, but whenever an iteration of such a group matches no
+characters, matching moves on to the next item in the pattern instead of
+repeatedly matching an empty string. This does not prevent backtracking into
+any of the iterations if a subsequent item fails to match.
 .P
 By default, quantifiers are "greedy", that is, they match as much as possible
 (up to the maximum number of permitted times), without causing the rest of the
@ -2140,10 +2095,10 @@ be easier to remember:
 .sp
  (*atomic:\ed+)foo
 .sp
-This kind of parenthesized group "locks up" the  part of the pattern it
-contains once it has matched, and a failure further into the pattern is
-prevented from backtracking into it. Backtracking past it to previous items,
-however, works as normal.
+This kind of parenthesized group "locks up" the part of the pattern it contains
+once it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
 .P
 An alternative description is that a group of this type matches exactly the
 string of characters that an identical standalone pattern would match, if
@ -2339,14 +2294,14 @@ the first iteration does not need to match the backreference. This can be done
 using alternation, as in the example above, or by a quantifier with a minimum
 of zero.
 .P
-Backreferences of this type cause the group that they reference to be treated
-as an
+For versions of PCRE2 less than 10.25, backreferences of this type used to
+cause the group that they reference to be treated as an
 .\" HTML <a href="#atomicgroup">
 .\" </a>
 atomic group.
 .\"
-Once the whole group has been matched, a subsequent matching failure cannot
-cause backtracking into the middle of the group.
+This restriction no longer applies, and backtracking into such groups can occur
+as normal.
 .
 .
 .\" HTML <a name="bigassertions"></a>
@ -2367,9 +2322,19 @@ those that look behind it, and in each case an assertion may be positive (must
 match for the assertion to be true) or negative (must not match for the
 assertion to be true). An assertion group is matched in the normal way,
 and if it is true, matching continues after it, but with the matching position
-in the subject string is was it was before the assertion was processed.
+in the subject string reset to what it was before the assertion was processed.
 .P
-A lookaround assertion may also appear as the condition in a
+The Perl-compatible lookaround assertions are atomic. If an assertion is true,
+but there is a subsequent matching failure, there is no backtracking into the
+assertion. However, there are some cases where non-atomic assertions can be
+useful. PCRE2 has some support for these, described in the section entitled
+.\" HTML <a href="#nonatomicassertions">
+.\" </a>
+"Non-atomic assertions"
+.\"
+below, but they are not Perl-compatible.
+.P
+A lookaround assertion may appear as the condition in a
 .\" HTML <a href="#conditions">
 .\" </a>
 conditional group
@ -2404,36 +2369,23 @@ the "no" branch of the condition. For other failing negative assertions,
 control passes to the previous backtracking point, thus discarding any captured
 strings within the assertion.
 .P
-For compatibility with Perl, most assertion groups may be repeated; though it
-makes no sense to assert the same thing several times, the side effect of
-capturing may occasionally be useful. However, an assertion that forms the
-condition for a conditional group may not be quantified. In practice, for
-other assertions, there only three cases:
-.sp
-(1) If the quantifier is {0}, the assertion is never obeyed during matching.
-However, it may contain internal capture groups that are called from elsewhere
-via the
-.\" HTML <a href="#groupsassubroutines">
-.\" </a>
-subroutine mechanism.
-.\"
-.sp
-(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
-were {0,1}. At run time, the rest of the pattern match is tried with and
-without the assertion, the order depending on the greediness of the quantifier.
-.sp
-(3) If the minimum repetition is greater than zero, the quantifier is ignored.
-The assertion is obeyed just once when encountered during matching.
+Most assertion groups may be repeated; though it makes no sense to assert the
+same thing several times, the side effect of capturing in positive assertions
+may occasionally be useful. However, an assertion that forms the condition for
+a conditional group may not be quantified. PCRE2 used to restrict the
+repetition of assertions, but from release 10.35 the only restriction is that
+an unlimited maximum repetition is changed to be one more than the minimum. For
+example, {3,} is treated as {3,4}.
 .
 .
 .SS "Alphabetic assertion names"
 .rs
 .sp
-Traditionally, symbolic sequences such as (?= and (?<= have been used to specify
-lookaround assertions. Perl 5.28 introduced some experimental alphabetic
-alternatives which might be easier to remember. They all start with (* instead
-of (? and must be written using lower case letters. PCRE2 supports the
-following synonyms:
+Traditionally, symbolic sequences such as (?= and (?<= have been used to
+specify lookaround assertions. Perl 5.28 introduced some experimental
+alphabetic alternatives which might be easier to remember. They all start with
+(* instead of (? and must be written using lower case letters. PCRE2 supports
+the following synonyms:
 .sp
  (*positive_lookahead:  or (*pla: is the same as (?=
  (*negative_lookahead:  or (*nla: is the same as (?!
@ -2610,6 +2562,68 @@ is another pattern that matches "foo" preceded by three digits and any three
 characters that are not "999".
 .
 .
+.\" HTML <a name="nonatomicassertions"></a>
+.SH "NON-ATOMIC ASSERTIONS"
+.rs
+.sp
+The traditional Perl-compatible lookaround assertions are atomic. That is, if
+an assertion is true, but there is a subsequent matching failure, there is no
+backtracking into the assertion. However, there are some cases where non-atomic
+positive assertions can be useful. PCRE2 provides these using the following
+syntax:
+.sp
+  (*non_atomic_positive_lookahead:  or (*napla: or (?*
+  (*non_atomic_positive_lookbehind: or (*naplb: or (?<*
+.sp
+Consider the problem of finding the right-most word in a string that also
+appears earlier in the string, that is, it must appear at least twice in total.
+This pattern returns the required result as captured substring 1:
+.sp
+  ^(?x)(*napla: .* \eb(\ew++)) (?> .*? \eb\e1\eb ){2}
+.sp
+For a subject such as "word1 word2 word3 word2 word3 word4" the result is
+"word3". How does it work? At the start, ^(?x) anchors the pattern and sets the
+"x" option, which causes white space (introduced for readability) to be
+ignored. Inside the assertion, the greedy .* at first consumes the entire
+string, but then has to backtrack until the rest of the assertion can match a
+word, which is captured by group 1. In other words, when the assertion first
+succeeds, it captures the right-most word in the string.
+.P
+The current matching point is then reset to the start of the subject, and the
+rest of the pattern match checks for two occurrences of the captured word,
+using an ungreedy .*? to scan from the left. If this succeeds, we are done, but
+if the last word in the string does not occur twice, this part of the pattern
+fails. If a traditional atomic lookhead (?= or (*pla: had been used, the
+assertion could not be re-entered, and the whole match would fail. The pattern
+would succeed only if the very last word in the subject was found twice.
+.P
+Using a non-atomic lookahead, however, means that when the last word does not
+occur twice in the string, the lookahead can backtrack and find the second-last
+word, and so on, until either the match succeeds, or all words have been
+tested.
+.P
+Two conditions must be met for a non-atomic assertion to be useful: the
+contents of one or more capturing groups must change after a backtrack into the
+assertion, and there must be a backreference to a changed group later in the
+pattern. If this is not the case, the rest of the pattern match fails exactly
+as before because nothing has changed, so using a non-atomic assertion just
+wastes resources.
+.P
+There is one exception to backtracking into a non-atomic assertion. If an
+(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That
+is, a subsequent match failure cannot backtrack into the assertion.
+.P
+Non-atomic assertions are not supported by the alternative matching function
+\fBpcre2_dfa_match()\fP. They are supported by JIT, but only if they do not
+contain any control verbs such as (*ACCEPT). (This may change in future). Note
+that assertions that appear as conditions for
+.\" HTML <a href="#conditions">
+.\" </a>
+conditional groups
+.\"
+(see below) must be atomic.
+.
+.
 .SH "SCRIPT RUNS"
 .rs
 .sp
@ -2830,7 +2844,7 @@ breaks):
  (?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
  \eb (?&byte) (\e.(?&byte)){3} \eb
 .sp
-The first part of the pattern is a DEFINE group inside which a another group
+The first part of the pattern is a DEFINE group inside which another group
 named "byte" is defined. This matches an individual component of an IPv4
 address (a number less than 256). When matching takes place, this part of the
 pattern is skipped because DEFINE acts like a false condition. The rest of the
@ -2861,8 +2875,15 @@ than two digits.
 .sp
 If the condition is not in any of the above formats, it must be a parenthesized
 assertion. This may be a positive or negative lookahead or lookbehind
-assertion. Consider this pattern, again containing non-significant white space,
-and with the two alternatives on the second line:
+assertion. However, it must be a traditional atomic assertion, not one of the
+PCRE2-specific
+.\" HTML <a href="#nonatomicassertions">
+.\" </a>
+non-atomic assertions.
+.\"
+.P
+Consider this pattern, again containing non-significant white space, and with
+the two alternatives on the second line:
 .sp
  (?(?=[^a-z]*[a-z])
  \ed{2}-[a-z]{3}-\ed{2}  |  \ed{2}-\ed{2}-\ed{2} )
@ -3261,8 +3282,8 @@ The doubling is removed before the string is passed to the callout function.
 There are a number of special "Backtracking Control Verbs" (to use Perl's
 terminology) that modify the behaviour of backtracking during matching. They
 are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form,
-possibly behaving differently depending on whether or not a name is present.
-The names are not required to be unique within the pattern.
+and may behave differently depending on whether or not a name argument is
+present. The names are not required to be unique within the pattern.
 .P
 By default, for compatibility with Perl, a name is any sequence of characters
 that does not include a closing parenthesis. The name is not processed in
@ -3286,7 +3307,8 @@ PCRE2_ALT_VERBNAMES is also set.
 The maximum length of a name is 255 in the 8-bit library and 65535 in the
 16-bit and 32-bit libraries. If the name is empty, that is, if the closing
 parenthesis immediately follows the colon, the effect is as if the colon were
-not there. Any number of these verbs may occur in a pattern.
+not there. Any number of these verbs may occur in a pattern. Except for
+(*ACCEPT), they may not be quantified.
 .P
 Since these verbs are specifically related to backtracking, most of them can be
 used only when the pattern is to be matched using the traditional matching
@ -3360,6 +3382,18 @@ example:
 This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by
 the outer parentheses.
 .P
+(*ACCEPT) is the only backtracking verb that is allowed to be quantified
+because an ungreedy quantification with a minimum of zero acts only when a
+backtrack happens. Consider, for example,
+.sp
+  (A(*ACCEPT)??B)C
+.sp
+where A, B, and C may be complex expressions. After matching "A", the matcher
+processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and
+the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT)
+(see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means
+"succeed on backtrack".
+.P
 \fBWarning:\fP (*ACCEPT) should not be used within a script run group, because
 it causes an immediate exit from the group, bypassing the script run checking.
 .sp
@ -3376,8 +3410,9 @@ nearest equivalent is the callout feature, as for example in this pattern:
 A match with the string "aaaa" always fails, but the callout is taken before
 each backtrack happens (in this example, 10 times).
 .P
-(*ACCEPT:NAME) and (*FAIL:NAME) are treated as (*MARK:NAME)(*ACCEPT) and
-(*MARK:NAME)(*FAIL), respectively.
+(*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and
+(*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before
+the verb acts.
 .
 .
 .SS "Recording which path was taken"
@ -3539,10 +3574,15 @@ successful match if there is a later mismatch. Consider:
 .sp
 If the subject is "aaaac...", after the first match attempt fails (starting at
 the first character in the string), the starting point skips on to start the
-next attempt at "c". Note that a possessive quantifer does not have the same
+next attempt at "c". Note that a possessive quantifier does not have the same
 effect as this example; although it would suppress backtracking during the
 first match attempt, the second attempt would start at the second character
 instead of skipping on to "c".
+.P
+If (*SKIP) is used to specify a new starting position that is the same as the
+starting position of the current match, or (by being inside a lookbehind)
+earlier, the position specified by (*SKIP) is ignored, and instead the normal
+"bumpalong" occurs.
 .sp
  (*SKIP:NAME)
 .sp
@ -3700,11 +3740,22 @@ a positive assertion and false for a negative one; captured substrings are
 retained in both cases.
 .P
 The remaining verbs act only when a later failure causes a backtrack to
-reach them. This means that their effect is confined to the assertion,
-because lookaround assertions are atomic. A backtrack that occurs after an
-assertion is complete does not jump back into the assertion. Note in particular
-that a (*MARK) name that is set in an assertion is not "seen" by an instance of
-(*SKIP:NAME) latter in the pattern.
+reach them. This means that, for the Perl-compatible assertions, their effect
+is confined to the assertion, because Perl lookaround assertions are atomic. A
+backtrack that occurs after such an assertion is complete does not jump back
+into the assertion. Note in particular that a (*MARK) name that is set in an
+assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.
+.P
+PCRE2 now supports non-atomic positive assertions, as described in the section
+entitled
+.\" HTML <a href="#nonatomicassertions">
+.\" </a>
+"Non-atomic assertions"
+.\"
+above. These assertions must be standalone (not used as conditions). They are
+not Perl-compatible. For these assertions, a later backtrack does jump back
+into the assertion, and therefore verbs such as (*COMMIT) can be triggered by
+backtracks from later in the pattern.
 .P
 The effect of (*THEN) is not allowed to escape beyond an assertion. If there
 are no more branches to try, (*THEN) causes a positive assertion to be false,
@ -3754,7 +3805,7 @@ there is a backtrack at the outer level.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -3763,6 +3814,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 12 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 12 January 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2perform.3
+++ b/doc/pcre2perform.3
@ -1,4 +1,4 @@
-.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 PERFORMANCE"
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code. 
+.P
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+.P
+Until release 10.41, an initial 20KiB frames vector was allocated on the system 
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to \fBpcre2_match()\fP. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+.P
+The size of the initial block is the larger of 20KiB or ten times the pattern's 
+frame size, unless the heap limit is less than this, in which case the heap 
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is 
+checked only when a new block is to be allocated. Reducing the heap limit 
+between calls to \fBpcre2_match()\fP with the same match data block does not 
+affect the saved block.
 .P
 In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
 function calls, but only for processing atomic groups, lookaround assertions,
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@ -239,6 +255,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 03 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
--- a/doc/pcre2posix.3
+++ b/doc/pcre2posix.3
@ -1,4 +1,4 @@
-.TH PCRE2POSIX 3 "30 January 2019" "PCRE2 10.33"
+.TH PCRE2POSIX 3 "26 April 2021" "PCRE2 10.37"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "SYNOPSIS"
@ -44,11 +44,14 @@ can be accessed by adding \fB-lpcre2-posix\fP to the command for linking an
 application. Because the POSIX functions call the native ones, it is also
 necessary to add \fB-lpcre2-8\fP.
 .P
-Although they are not defined as protypes in \fBpcre2posix.h\fP, the library
-does contain functions with the POSIX names \fBregcomp()\fP etc. These simply
-pass their arguments to the PCRE2 functions. These functions are provided for
-backwards compatibility with earlier versions of PCRE2, so that existing
-programs do not have to be recompiled.
+Although they were not defined as protypes in \fBpcre2posix.h\fP, releases
+10.33 to 10.36 of the library contained functions with the POSIX names
+\fBregcomp()\fP etc. These simply passed their arguments to the PCRE2
+functions. These functions were provided for backwards compatibility with
+earlier versions of PCRE2, which had only POSIX names. However, this has proved
+troublesome in situations where a program links with several libraries, some of
+which use PCRE2's POSIX interface while others use the real POSIX functions.
+For this reason, the POSIX names have been removed since release 10.37.
 .P
 Calling the header file \fBpcre2posix.h\fP avoids any conflict with other POSIX
 libraries. It can, of course, be renamed or aliased as \fBregex.h\fP, which is
@ -321,6 +324,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 January 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 26 April 2021
+Copyright (c) 1997-2021 University of Cambridge.
 .fi
--- a/doc/pcre2serialize.3
+++ b/doc/pcre2serialize.3
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
 .nf
 .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
-.B "  int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
+.B "  int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
 .B "  pcre2_general_context *\fIgcontext\fP);"
 .sp
-.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
-.B "  int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
+.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
+.B "  int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
 .B "  PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
 .sp
 .B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
 .sp
  PCRE2_ERROR_BADDATA      the number of patterns is zero or less
  PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
  PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
  PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 .sp
@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
 \fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
 stream is no longer needed and can be discarded.
 .sp
-  int32_t number_of_codes;
  pcre2_code *list_of_codes[2];
  uint8_t *bytes = <serialized data>;
  int32_t number_of_codes =
--- a/Show More
+++ b/Show More