Compare commits
122 Commits
pcre2-10.3
...
amigaos
Author | SHA1 | Date |
---|---|---|
George Sokianos | 4a45482c9c | |
Philip Hazel | 8b133fa0ba | |
Philip Hazel | cc5e121c8e | |
Philip Hazel | 1343bdff8f | |
Philip Hazel | d90fb23878 | |
Ezekiel Warren | e47fc51584 | |
Zoltan Herczeg | b67d568201 | |
Zoltan Herczeg | 4851890ede | |
Amin Yahyaabadi | 3e52db5209 | |
Philip Hazel | 4804b00e8f | |
Philip Hazel | 7549fdca74 | |
Philip Hazel | 5271b533c4 | |
larinsv | 45af1203bd | |
Rémi Verschelde | 187b7ba050 | |
William A Rowe Jr | 06f34ba374 | |
GregThain | a334ea2a34 | |
Carlo Marcelo Arenas Belón | 15a82c3efd | |
Philip Hazel | 51a5fcdc1f | |
Philip Hazel | 104fe2fead | |
Philip Hazel | f65df06305 | |
pkeir | a13d7d4340 | |
Lucas Trzesniewski | c630e868ca | |
Joe Zhang | 77ce1ff528 | |
Philip Hazel | ff5402a378 | |
Philip Hazel | b52d055d1b | |
Carlo Marcelo Arenas Belón | a4ac97fea8 | |
Philip Hazel | fedf4d9d40 | |
Philip Hazel | 8ebf9efe7b | |
Carlo Marcelo Arenas Belón | 4edcf6ada5 | |
Philip Hazel | d0c7544e78 | |
Carlo Marcelo Arenas Belón | f28e82602d | |
Philip Hazel | 1bb2b97b29 | |
Lucas Trzesniewski | 3fec24a26f | |
Philip Hazel | 66b3cb34df | |
Philip Hazel | 29a43aa11d | |
Philip Hazel | 3103b8f20a | |
Philip Hazel | 13be26a5c2 | |
pagabuc | ba6a5f16d2 | |
Zoltan Herczeg | d07c967b3a | |
Carlo Marcelo Arenas Belón | 4279abbd7d | |
Philip Hazel | 8ff3ab27d5 | |
Zoltan Herczeg | e612e06b5d | |
Philip Hazel | 64c9baaaa4 | |
Carlo Marcelo Arenas Belón | 9c8abddc52 | |
Carlo Marcelo Arenas Belón | f11c26842d | |
Zoltan Herczeg | 4ca0530b9b | |
Zoltan Herczeg | 03654e751e | |
Zoltan Herczeg | d4fa336fbc | |
Zoltan Herczeg | 50a51cb7e6 | |
Philip Hazel | f7a7341726 | |
Philip Hazel | eef5740ff9 | |
Zoltan Herczeg | dea56d2df9 | |
Adam | 111cd470b5 | |
Philip Hazel | fdd9479108 | |
Philip Hazel | 419e3c68a3 | |
Zoltan Herczeg | e21345de97 | |
Philip Hazel | e85a81ebac | |
Philip Hazel | 504ff06fff | |
Philip Hazel | 360a84e80b | |
Zoltan Herczeg | 061e57695a | |
Philip Hazel | 7f7d3e8521 | |
Philip Hazel | bf35c0518c | |
Zoltan Herczeg | 68fbc1982e | |
Philip Hazel | 06d3a66065 | |
Philip Hazel | 87571b5af3 | |
Philip Hazel | 838cdac4dc | |
Philip Hazel | 628a804102 | |
Philip Hazel | ec091e2e44 | |
Philip Hazel | 636569a957 | |
Philip Hazel | 81d3729c66 | |
Zoltan Herczeg | f90542a209 | |
Carlo Marcelo Arenas Belón | 14dbc6e6ec | |
Philip Hazel | 80205ee2a0 | |
Jessica Clarke | 04ecb267c0 | |
Jessica Clarke | 534b4760e3 | |
Philip Hazel | 31fb2e58a1 | |
Zoltan Herczeg | 435140a0ac | |
Philip Hazel | c24047f15d | |
Zoltan Herczeg | e7457003cd | |
Philip Hazel | d888d36013 | |
Zoltan Herczeg | 6614b281bc | |
Zoltan Herczeg | afa4756d19 | |
Philip Hazel | 7713f33e46 | |
Michael Kaufmann | af2637ee5e | |
Philip Hazel | 98e7d70bc6 | |
Philip Hazel | 321b559ed4 | |
Philip Hazel | 16c8a84cce | |
Philip Hazel | 4514ddd2a2 | |
Philip Hazel | 944f0e10a1 | |
Philip Hazel | b29732063b | |
Philip Hazel | 92d7cf1dd0 | |
Philip Hazel | 1d432ee3cf | |
Philip Hazel | 194a15315a | |
Philip Hazel | 1c41a5b815 | |
Zoltan Herczeg | 4243515033 | |
Philip Hazel | 49b29f837d | |
Philip Hazel | 30abd0ac8d | |
Philip Hazel | 0246c6bf64 | |
Philip Hazel | 823d4ac956 | |
Philip Hazel | ba3d0edcbd | |
Philip Hazel | 4ef0c51d2b | |
Philip Hazel | 7ab2769728 | |
Philip Hazel | 2a294ddadb | |
Philip Hazel | cb854a912e | |
Philip Hazel | 16dccbcb13 | |
Carlo Marcelo Arenas Belón | ae4e6261e5 | |
Carlo Marcelo Arenas Belón | d24a1c9d31 | |
Carlo Marcelo Arenas Belón | 055b7ce4a9 | |
Philip Hazel | 4a8f5d104c | |
Carlo Marcelo Arenas Belón | 587b94277b | |
Philip Hazel | c8d31f1605 | |
Carlo Marcelo Arenas Belón | adf76faace | |
Zoltan Herczeg | d144199dfb | |
Carlo Marcelo Arenas Belón | eb42305f07 | |
Philip Hazel | 46890604a4 | |
Carlo Marcelo Arenas Belón | acc520924c | |
Philip Hazel | bc70a183fc | |
Carlo Marcelo Arenas Belón | dae475092d | |
Philip Hazel | 1ed34b9cb1 | |
Philip Hazel | f19e84674e | |
Carlo Marcelo Arenas Belón | 7db8784296 | |
Philip Hazel | 072717a61f |
|
@ -0,0 +1,3 @@
|
|||
common --experimental_enable_bzlmod
|
||||
build --incompatible_enable_cc_toolchain_resolution
|
||||
build --incompatible_strict_action_env
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
name: Build
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Linux
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
alpine:
|
||||
name: alpine
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autotools
|
||||
run: apk add --no-cache automake autoconf gcc libtool make musl-dev
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
windows:
|
||||
name: 32bit Windows
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Configure
|
||||
run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
cd build\Debug
|
||||
..\..\RunTest.bat
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master ]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: [ master ]
|
||||
schedule:
|
||||
- cron: '27 6 * * 4'
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'cpp', 'python' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v1
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v1
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
||||
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
|
||||
# and modify them (or add more) to build your code if your project
|
||||
# uses a compiled language
|
||||
|
||||
#- run: |
|
||||
# make bootstrap
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v1
|
|
@ -0,0 +1,55 @@
|
|||
name: Scorecards supply-chain security
|
||||
on:
|
||||
# Only the default branch is supported.
|
||||
branch_protection_rule:
|
||||
schedule:
|
||||
- cron: '23 17 * * 1'
|
||||
push:
|
||||
branches: [ master ]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecards analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
actions: read
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# Read-only PAT token. To create it,
|
||||
# follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
|
||||
repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
|
||||
# Publish the results to enable scorecard badges. For more details, see
|
||||
# https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories, `publish_results` will automatically be set to `false`,
|
||||
# regardless of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional).
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -6,7 +6,9 @@
|
|||
*.pc
|
||||
*.o
|
||||
*~
|
||||
*.lha
|
||||
|
||||
__pycache__
|
||||
.deps
|
||||
.libs
|
||||
|
||||
|
@ -74,4 +76,7 @@ src/pcre2.h
|
|||
src/pcre2_chartables.c
|
||||
src/stamp-h1
|
||||
|
||||
/bazel-*
|
||||
|
||||
# End
|
||||
|
||||
|
|
6
AUTHORS
6
AUTHORS
|
@ -8,7 +8,7 @@ Email domain: gmail.com
|
|||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2021 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
|
@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Emain domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2021 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Emain domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2021 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
####
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
|
||||
load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
|
||||
|
||||
copy_file(
|
||||
name = "config_h_generic",
|
||||
src = "src/config.h.generic",
|
||||
out = "src/config.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_h_generic",
|
||||
src = "src/pcre2.h.generic",
|
||||
out = "src/pcre2.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_chartables_c",
|
||||
src = "src/pcre2_chartables.c.dist",
|
||||
out = "src/pcre2_chartables.c",
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "pcre2",
|
||||
srcs = [
|
||||
"src/pcre2_auto_possess.c",
|
||||
"src/pcre2_compile.c",
|
||||
"src/pcre2_config.c",
|
||||
"src/pcre2_context.c",
|
||||
"src/pcre2_convert.c",
|
||||
"src/pcre2_dfa_match.c",
|
||||
"src/pcre2_error.c",
|
||||
"src/pcre2_extuni.c",
|
||||
"src/pcre2_find_bracket.c",
|
||||
"src/pcre2_maketables.c",
|
||||
"src/pcre2_match.c",
|
||||
"src/pcre2_match_data.c",
|
||||
"src/pcre2_newline.c",
|
||||
"src/pcre2_ord2utf.c",
|
||||
"src/pcre2_pattern_info.c",
|
||||
"src/pcre2_script_run.c",
|
||||
"src/pcre2_serialize.c",
|
||||
"src/pcre2_string_utils.c",
|
||||
"src/pcre2_study.c",
|
||||
"src/pcre2_substitute.c",
|
||||
"src/pcre2_substring.c",
|
||||
"src/pcre2_tables.c",
|
||||
"src/pcre2_ucd.c",
|
||||
"src/pcre2_ucptables.c",
|
||||
"src/pcre2_valid_utf.c",
|
||||
"src/pcre2_xclass.c",
|
||||
":pcre2_chartables_c",
|
||||
],
|
||||
hdrs = glob(["src/*.h"]) + [
|
||||
":config_h_generic",
|
||||
":pcre2_h_generic",
|
||||
],
|
||||
defines = [
|
||||
"HAVE_CONFIG_H",
|
||||
"PCRE2_CODE_UNIT_WIDTH=8",
|
||||
"PCRE2_STATIC",
|
||||
],
|
||||
includes = ["src"],
|
||||
strip_include_prefix = "src",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "pcre2demo",
|
||||
srcs = ["src/pcre2demo.c"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":pcre2"],
|
||||
)
|
|
@ -103,13 +103,18 @@
|
|||
PROJECT(PCRE2 C)
|
||||
|
||||
# Increased minimum to 2.8.5 to support GNUInstallDirs.
|
||||
# Increased minimum to 3.0.0 because older than 2.8.12 is deprecated.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
|
||||
# Increased minimum to 3.1 to support imported targets.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.1)
|
||||
|
||||
# Set policy CMP0026 to avoid warnings for the use of LOCATION in
|
||||
# GET_TARGET_PROPERTY. This should no longer be required.
|
||||
# CMAKE_POLICY(SET CMP0026 OLD)
|
||||
|
||||
# With a recent cmake, you can provide a rootdir to look for non
|
||||
# standard installed library dependencies, but to do so, the policy
|
||||
# needs to be set to new (by uncommenting the following)
|
||||
# CMAKE_POLICY(SET CMP0074 NEW)
|
||||
|
||||
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
|
||||
# on the command line.
|
||||
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
@ -142,10 +147,16 @@ CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
|
|||
CHECK_SYMBOL_EXISTS(bcopy "strings.h" HAVE_BCOPY)
|
||||
CHECK_SYMBOL_EXISTS(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE)
|
||||
CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE)
|
||||
CHECK_SYMBOL_EXISTS(realpath "stdlib.h" HAVE_REALPATH)
|
||||
CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV)
|
||||
CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
|
||||
HAVE_REALPATH
|
||||
)
|
||||
|
||||
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
|
@ -300,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
|
|||
IF(EDITLINE_FOUND)
|
||||
OPTION (PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
IF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ELSE(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
|
||||
" or set Editline_ROOT to a full libedit installed tree, as needed\n"
|
||||
" Might need to enable policy CMP0074 in CMakeLists.txt"
|
||||
)
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
|
||||
# readline lib
|
||||
IF(READLINE_FOUND)
|
||||
|
@ -340,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
|||
ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
|
||||
IF(READLINE_FOUND)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" Only one of the readline compatible libraries can be enabled.\n"
|
||||
" Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
|
||||
)
|
||||
ENDIF(READLINE_FOUND)
|
||||
ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
@ -357,6 +383,12 @@ ENDIF(PCRE2_SUPPORT_UNICODE)
|
|||
|
||||
IF(PCRE2_SUPPORT_JIT)
|
||||
SET(SUPPORT_JIT 1)
|
||||
IF(UNIX)
|
||||
FIND_PACKAGE(Threads REQUIRED)
|
||||
IF(CMAKE_USE_PTHREADS_INIT)
|
||||
SET(REQUIRE_PTHREAD 1)
|
||||
ENDIF(CMAKE_USE_PTHREADS_INIT)
|
||||
ENDIF(UNIX)
|
||||
ENDIF(PCRE2_SUPPORT_JIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
|
@ -626,6 +658,8 @@ IF(MINGW AND BUILD_SHARED_LIBS)
|
|||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC AND BUILD_SHARED_LIBS)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
@ -671,6 +705,10 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
|
||||
|
@ -681,6 +719,7 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET(targets ${targets} pcre2-posix-static)
|
||||
|
||||
IF(MSVC)
|
||||
|
@ -697,6 +736,7 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -704,8 +744,12 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION}
|
||||
OUTPUT_NAME pcre2-8)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -715,6 +759,8 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
OUTPUT_NAME pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
|
||||
SET(targets ${targets} pcre2-posix-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
|
@ -740,6 +786,7 @@ ENDIF(PCRE2_BUILD_PCRE2_8)
|
|||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -747,6 +794,9 @@ IF(PCRE2_BUILD_PCRE2_16)
|
|||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-static)
|
||||
|
||||
IF(MSVC)
|
||||
|
@ -761,6 +811,7 @@ IF(PCRE2_BUILD_PCRE2_16)
|
|||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -768,7 +819,12 @@ IF(PCRE2_BUILD_PCRE2_16)
|
|||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION}
|
||||
OUTPUT_NAME pcre2-16)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
|
@ -792,6 +848,7 @@ ENDIF(PCRE2_BUILD_PCRE2_16)
|
|||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -799,6 +856,9 @@ IF(PCRE2_BUILD_PCRE2_32)
|
|||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-static)
|
||||
|
||||
IF(MSVC)
|
||||
|
@ -813,6 +873,7 @@ IF(PCRE2_BUILD_PCRE2_32)
|
|||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -820,7 +881,12 @@ IF(PCRE2_BUILD_PCRE2_32)
|
|||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION}
|
||||
OUTPUT_NAME pcre2-32)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
|
@ -1022,25 +1088,13 @@ FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
|
|||
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
|
||||
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
|
||||
|
||||
FOREACH(man ${man3})
|
||||
GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
|
||||
SET(man3_new ${man3} ${man})
|
||||
ENDFOREACH(man ${man3})
|
||||
SET(man3 ${man3_new})
|
||||
|
||||
INSTALL(FILES ${man1} DESTINATION man/man1)
|
||||
INSTALL(FILES ${man3} DESTINATION man/man3)
|
||||
INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)
|
||||
|
||||
IF(MSVC AND INSTALL_MSVC_PDB)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posix.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posixd.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS Debug)
|
||||
INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
|
||||
ENDIF(MSVC AND INSTALL_MSVC_PDB)
|
||||
|
||||
# Help, only for nice output
|
||||
|
|
169
ChangeLog
169
ChangeLog
|
@ -1,5 +1,160 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
Change Log for PCRE2 - see also the Git log
|
||||
-------------------------------------------
|
||||
|
||||
|
||||
Version 10.41 xx-xxx-2022
|
||||
-------------------------
|
||||
|
||||
1. Add fflush() before and after a fork callout in pcre2grep to get its output
|
||||
to be the same on all systems. (THere were previously ordering differences in
|
||||
Alpine Linux).
|
||||
|
||||
2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
|
||||
|
||||
3. SSF scorecards grumbled about possible overflow in an expression in
|
||||
pcre2test. It never would have overflowed in practice, but some casts have been
|
||||
added and at the some time there's been some tidying of fprints that output
|
||||
size_t values.
|
||||
|
||||
4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
|
||||
|
||||
5. Minor code re-arrangement to remove gcc warning about realloc() in
|
||||
pcre2test.
|
||||
|
||||
6. Change a number of int variables that hold buffer and line lengths in
|
||||
pcre2grep to PCRE2_SIZE (aka size_t).
|
||||
|
||||
7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
|
||||
supported (even though that function would do nothing in that case) at the
|
||||
request of a user who doesn't even want to link with pcre_jit_compile.o. Also
|
||||
tidied up an untidy #ifdef arrangement in pcre2test.
|
||||
|
||||
8. Fixed an issue in the backtracking optimization of character repeats in
|
||||
JIT. Furthermore optimize star repetitions, not just plus repetitions.
|
||||
|
||||
9. Removed the use of an initial backtracking frames vector on the system stack
|
||||
in pcre2_match() so that it now always uses the heap. (In a multi-thread
|
||||
environment with very small stacks there had been an issue.) This also is
|
||||
tidier for JIT matching, which didn't need that vector. The heap vector is now
|
||||
remembered in the match data block and re-used if that block itself is re-used.
|
||||
It is freed with the match data block.
|
||||
|
||||
10. Adjusted the find_limits code in pcre2test to work with change 9 above.
|
||||
|
||||
11. Added find_limits_noheap to pcre2test, because the heap limits are now
|
||||
different in different environments and so cannot be included in the standard
|
||||
tests.
|
||||
|
||||
12. Created a test for pcre2_match() heap processing that is not part of the
|
||||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
|
||||
handling of multiple passes.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
|
||||
in pcre2grep with buffered fseek(stdin).
|
||||
|
||||
3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
|
||||
not supported.
|
||||
|
||||
4. Revert an unintended change in JIT repeat detection.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
|
||||
|
||||
6. Merged documentation and comments patches from @carenas (GitHub #47).
|
||||
|
||||
7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
|
||||
from pcre2grep.
|
||||
|
||||
8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
|
||||
|
||||
9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
|
||||
substituting.
|
||||
|
||||
10. Add null_subject and null_replacement modifiers to pcre2test.
|
||||
|
||||
11. Add check for NULL subject to POSIX regexec() function.
|
||||
|
||||
12. Add check for NULL replacement to pcre2_substitute().
|
||||
|
||||
13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
|
||||
pcre2_substitute(), and the replacement argument of the latter, if the pointer
|
||||
is NULL and the length is zero, treat as an empty string. Apparently a number
|
||||
of applications treat NULL/0 in this way.
|
||||
|
||||
14. Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
15. Fix some minor issues raised by clang sanitize.
|
||||
|
||||
16. Very minor code speed up for maximizing character property matches.
|
||||
|
||||
17. A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
18. The Python scripts in the maint directory have been refactored. There are
|
||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||
duplicated are now held in a single common Python module.
|
||||
|
||||
19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
|
||||
hardware capabilities, which consist of both an integer address and additional
|
||||
metadata, meaning they are twice the size of the platform's size_t type, i.e.
|
||||
16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
|
||||
8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
|
||||
not 16. Whilst the first frame was always suitably aligned, this then
|
||||
misaligned the frame that follows, resulting in an alignment fault when storing
|
||||
a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
|
||||
Clarke PR#72.
|
||||
|
||||
20. Added -LP and -LS listing options to pcre2test.
|
||||
|
||||
21. A user discovered that the library names in CMakeLists.txt for MSVC
|
||||
debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
|
||||
|
||||
22. An item such as [Aa] is optimized into a caseless single character match.
|
||||
When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
|
||||
pattern, the optimizing "must be present for a match" character check was not
|
||||
being flagged as caseless, causing some matches that should have succeeded to
|
||||
fail.
|
||||
|
||||
23. Fixed a unicode property matching issue in JIT. The character was not
|
||||
fully read in caseless matching.
|
||||
|
||||
24. Fixed an issue affecting recursions in JIT caused by duplicated data
|
||||
transfers.
|
||||
|
||||
25. Merged patch from @carenas (GitHub #96) which fixes some problems with
|
||||
pcre2test and readline/readedit:
|
||||
|
||||
* Use the right header for libedit in FreeBSD with autoconf
|
||||
* Really allow libedit with cmake
|
||||
* Avoid using readline headers with libedit
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
@ -14,10 +169,10 @@ Version 10.39 29-October-2021
|
|||
honoured if chosen.
|
||||
|
||||
prtdiff_t is signed, so use a signed type instead, and make sure
|
||||
that an appropiate width is chosen if pointers are 64bit wide and
|
||||
that an appropriate width is chosen if pointers are 64bit wide and
|
||||
long is not (ex: Windows 64bit).
|
||||
|
||||
IMHO removing the cast (and therefore the positibilty of truncation)
|
||||
IMHO removing the cast (and therefore the possibilty of truncation)
|
||||
make the code cleaner and the fallback is likely portable enough
|
||||
with all 64-bit POSIX systems doing LP64 except for Windows.
|
||||
|
||||
|
@ -68,7 +223,7 @@ Version 10.38 01-October-2021
|
|||
-----------------------------
|
||||
|
||||
1. Fix invalid single character repetition issues in JIT when the repetition
|
||||
is inside a capturing bracket and the bracket is preceeded by character
|
||||
is inside a capturing bracket and the bracket is preceded by character
|
||||
literals.
|
||||
|
||||
2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
|
||||
|
@ -308,7 +463,7 @@ now correctly backtracked, so this unnecessary restriction has been removed.
|
|||
|
||||
7. Added PCRE2_SUBSTITUTE_MATCHED.
|
||||
|
||||
8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another
|
||||
8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
|
||||
regex engine. The Perl regex folks are aware of this usage and have made a note
|
||||
about it.
|
||||
|
||||
|
@ -739,7 +894,7 @@ Patch by Guillem Jover.
|
|||
warnings were reported.
|
||||
|
||||
38. Using the clang compiler with sanitizing options causes runtime complaints
|
||||
about truncation for statments such as x = ~x when x is an 8-bit value; it
|
||||
about truncation for statements such as x = ~x when x is an 8-bit value; it
|
||||
seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
|
||||
gets rid of the warnings. There were also two missing casts in pcre2test.
|
||||
|
||||
|
|
58
HACKING
58
HACKING
|
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
|
|||
the pcre2test documentation and the comment at the head of the RunTest file.
|
||||
|
||||
PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
|
||||
releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
|
||||
confusion with PCRE1.
|
||||
releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
|
||||
releases started at 10.00 to avoid confusion with PCRE1.
|
||||
|
||||
|
||||
Historical note 1
|
||||
|
@ -38,8 +38,8 @@ Historical note 2
|
|||
By contrast, the code originally written by Henry Spencer (which was
|
||||
subsequently heavily modified for Perl) compiles the expression twice: once in
|
||||
a dummy mode in order to find out how much store will be needed, and then for
|
||||
real. (The Perl version probably doesn't do this any more; I'm talking about
|
||||
the original library.) The execution function operates by backtracking and
|
||||
real. (The Perl version may or may not still do this; I'm talking about the
|
||||
original library.) The execution function operates by backtracking and
|
||||
maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
|
||||
matches individual wild portions of the pattern. This is an "NFA algorithm" in
|
||||
Friedl's terminology.
|
||||
|
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
|
|||
advance to check for such values. When auto-callouts are enabled, the generous
|
||||
assumption is made that there will be a callout for each pattern code unit
|
||||
(which of course is only actually true if all code units are literals) plus one
|
||||
at the end. There is a default parsed pattern vector on the system stack, but
|
||||
if this is not big enough, heap memory is used.
|
||||
at the end. A default parsed pattern vector is defined on the system stack, to
|
||||
minimize memory handling, but if this is not big enough, heap memory is used.
|
||||
|
||||
As before, the actual compiling function is run twice, the first time to
|
||||
determine the amount of memory needed for the final compiled pattern. It
|
||||
|
@ -187,7 +187,7 @@ META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
|
|||
META_CLASS_EMPTY_NOT [^] negative empty class - ditto
|
||||
META_CLASS_END ] end of non-empty class
|
||||
META_CLASS_NOT [^ start non-empty negative class
|
||||
META_COMMIT (*COMMIT)
|
||||
META_COMMIT (*COMMIT) - no argument (see below for with argument)
|
||||
META_COND_ASSERT (?(?assertion)
|
||||
META_DOLLAR $ metacharacter
|
||||
META_DOT . metacharacter
|
||||
|
@ -201,18 +201,18 @@ META_NOCAPTURE (?: no capture parens
|
|||
META_PLUS +
|
||||
META_PLUS_PLUS ++
|
||||
META_PLUS_QUERY +?
|
||||
META_PRUNE (*PRUNE) - no argument
|
||||
META_PRUNE (*PRUNE) - no argument (see below for with argument)
|
||||
META_QUERY ?
|
||||
META_QUERY_PLUS ?+
|
||||
META_QUERY_QUERY ??
|
||||
META_RANGE_ESCAPED hyphen in class range with at least one escape
|
||||
META_RANGE_LITERAL hyphen in class range defined literally
|
||||
META_SKIP (*SKIP) - no argument
|
||||
META_THEN (*THEN) - no argument
|
||||
META_SKIP (*SKIP) - no argument (see below for with argument)
|
||||
META_THEN (*THEN) - no argument (see below for with argument)
|
||||
|
||||
The two RANGE values occur only in character classes. They are positioned
|
||||
between two literals that define the start and end of the range. In an EBCDIC
|
||||
evironment it is necessary to know whether either of the range values was
|
||||
environment it is necessary to know whether either of the range values was
|
||||
specified as an escape. In an ASCII/Unicode environment the distinction is not
|
||||
relevant.
|
||||
|
||||
|
@ -229,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
|
|||
is the length of its branch, for which OP_REVERSE must be generated.
|
||||
|
||||
META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
|
||||
their data in the lower 16 bits of the element.
|
||||
their data in the lower 16 bits of the element. META_RECURSE is followed by an
|
||||
offset, for use in error messages.
|
||||
|
||||
META_BACKREF is followed by an offset if the back reference group number is 10
|
||||
or more. The offsets of the first ocurrences of references to groups whose
|
||||
or more. The offsets of the first occurrences of references to groups whose
|
||||
numbers are less than 10 are put in cb->small_ref_offset[] (only the first
|
||||
occurrence is useful). On 64-bit systems this avoids using more than two parsed
|
||||
pattern elements for items such as \3. The offset is used when an error occurs
|
||||
because the reference is to a non-existent group.
|
||||
|
||||
META_RECURSE is always followed by an offset, for use in error messages.
|
||||
|
||||
META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
|
||||
element contains the 16-bit type and data property values, packed together.
|
||||
ESC_g and ESC_k are used only for named references - numerical ones are turned
|
||||
|
@ -291,9 +290,9 @@ META_LOOKBEHIND (?<= start of lookbehind
|
|||
META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind
|
||||
META_LOOKBEHINDNOT (?<! start of negative lookbehind
|
||||
|
||||
The following are followed by two elements, the minimum and maximum. Repeat
|
||||
values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
|
||||
represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
The following are followed by two elements, the minimum and maximum. The
|
||||
maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
|
||||
is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
|
||||
META_MINMAX {n,m} repeat
|
||||
META_MINMAX_PLUS {n,m}+ repeat
|
||||
|
@ -347,11 +346,11 @@ support is not available for this kind of matching.
|
|||
Changeable options
|
||||
------------------
|
||||
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
|
||||
others) may be changed in the middle of patterns by items such as (?i). Their
|
||||
processing is handled entirely at compile time by generating different opcodes
|
||||
for the different settings. The runtime functions do not need to keep track of
|
||||
an option's state.
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
|
||||
some others may be changed in the middle of patterns by items such as (?i).
|
||||
Their processing is handled entirely at compile time by generating different
|
||||
opcodes for the different settings. The runtime functions do not need to keep
|
||||
track of an option's state.
|
||||
|
||||
PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
|
||||
are tracked and processed during the parsing pre-pass. The others are handled
|
||||
|
@ -468,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
|
|||
case-equivalent code points (which is possible only in UTF mode) is handled by
|
||||
compiling a Unicode property item (see below), with the pseudo-property
|
||||
PT_CLIST. The value of this property is an offset in a vector called
|
||||
"ucd_caseless_sets" which identifies the start of a short list of equivalent
|
||||
characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
"ucd_caseless_sets" which identifies the start of a short list of case
|
||||
equivalent characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
|
||||
|
||||
Repeating single characters
|
||||
|
@ -546,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
|
|||
and a value. The types are a set of #defines of the form PT_xxx, and the values
|
||||
are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
|
||||
The value is relevant only for PT_GC (General Category), PT_PC (Particular
|
||||
Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
|
||||
identify a list of case-equivalent characters when there are three or more.
|
||||
Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
|
||||
and the pseudo-property PT_CLIST, which is used to identify a list of
|
||||
case-equivalent characters when there are three or more (see above).
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
|
||||
|
@ -667,7 +667,7 @@ used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
|
|||
OP_KETRMAX are used for indefinite repetitions, minimally or maximally
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
details). All four are followed by a LINK_SIZE value giving (as a positive
|
||||
number) the offset back to the matching bracket opcode.
|
||||
number) the offset back to the matching opening bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
|
@ -827,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
|
|||
opcode are the correct length, in order to catch updating errors.
|
||||
|
||||
Philip Hazel
|
||||
12 July 2019
|
||||
April 2022
|
||||
|
|
6
LICENCE
6
LICENCE
|
@ -26,7 +26,7 @@ Email domain: gmail.com
|
|||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2021 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2021 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2021 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
module(
|
||||
name = "pcre2",
|
||||
version = "10.40",
|
||||
compatibility_level = 1,
|
||||
)
|
||||
|
||||
bazel_dep(name = "rules_cc", version = "0.0.1")
|
||||
bazel_dep(name = "bazel_skylib", version = "1.2.1")
|
11
Makefile.am
11
Makefile.am
|
@ -382,6 +382,10 @@ COMMON_SOURCES = \
|
|||
src/pcre2_valid_utf.c \
|
||||
src/pcre2_xclass.c
|
||||
|
||||
# The pcre2_ucptables.c file is #included by pcre2_tables.c
|
||||
|
||||
EXTRA_DIST += src/pcre2_ucptables.c
|
||||
|
||||
if WITH_PCRE2_8
|
||||
lib_LTLIBRARIES += libpcre2-8.la
|
||||
libpcre2_8_la_SOURCES = \
|
||||
|
@ -448,9 +452,10 @@ EXTRA_DIST += \
|
|||
src/sljit/sljitNativePPC_32.c \
|
||||
src/sljit/sljitNativePPC_64.c \
|
||||
src/sljit/sljitNativePPC_common.c \
|
||||
src/sljit/sljitNativeRISCV_32.c \
|
||||
src/sljit/sljitNativeRISCV_64.c \
|
||||
src/sljit/sljitNativeRISCV_common.c \
|
||||
src/sljit/sljitNativeS390X.c \
|
||||
src/sljit/sljitNativeSPARC_32.c \
|
||||
src/sljit/sljitNativeSPARC_common.c \
|
||||
src/sljit/sljitNativeX86_32.c \
|
||||
src/sljit/sljitNativeX86_64.c \
|
||||
src/sljit/sljitNativeX86_common.c \
|
||||
|
@ -663,6 +668,7 @@ EXTRA_DIST += \
|
|||
testdata/testinput23 \
|
||||
testdata/testinput24 \
|
||||
testdata/testinput25 \
|
||||
testdata/testinput26 \
|
||||
testdata/testinputEBC \
|
||||
testdata/testoutput1 \
|
||||
testdata/testoutput2 \
|
||||
|
@ -705,6 +711,7 @@ EXTRA_DIST += \
|
|||
testdata/testoutput23 \
|
||||
testdata/testoutput24 \
|
||||
testdata/testoutput25 \
|
||||
testdata/testoutput26 \
|
||||
testdata/testoutputEBC \
|
||||
testdata/valgrind-jit.supp \
|
||||
testdata/wintestinput3 \
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
#
|
||||
# Project: pcre2
|
||||
#
|
||||
# Created on: 10-01-2022 22:01:46
|
||||
#
|
||||
# commands to use:
|
||||
# make -f Makefile.os4 libpcre2.a
|
||||
# make -f Makefile.os4 libpcre2-posix.a
|
||||
# make -f Makefile.os4 pcre2test
|
||||
# sh RunTest
|
||||
# make -f Makefile.os4 clean
|
||||
#
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Objects
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2_OBJ := \
|
||||
src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
|
||||
src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
|
||||
src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
|
||||
src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
|
||||
src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
|
||||
src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
|
||||
src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
|
||||
src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
|
||||
src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
|
||||
|
||||
|
||||
|
||||
pcre2posix_OBJ := \
|
||||
src/pcre2posix.o
|
||||
|
||||
|
||||
pcre2test_OBJ := \
|
||||
src/pcre2test.o
|
||||
|
||||
|
||||
pcre2grep_OBJ := \
|
||||
src/pcre2grep.o
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Variables and Environment
|
||||
##
|
||||
###################################################################
|
||||
|
||||
MCRT := -mcrt=newlib
|
||||
ifeq ($(USE_CLIB2), yes)
|
||||
MCRT := -mcrt=clib2
|
||||
endif
|
||||
|
||||
CC := gcc:bin/gcc
|
||||
|
||||
INCPATH := -I. -Isrc
|
||||
|
||||
# for pcre2test
|
||||
CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// General rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
.PHONY: all all-before all-after clean clean-custom realclean
|
||||
|
||||
all: all-before libpcre2.a libpcre2-posix.a all-after
|
||||
|
||||
all-before:
|
||||
# You can add rules here to execute before the project is built
|
||||
|
||||
all-after:
|
||||
# You can add rules here to execute after the project is built
|
||||
|
||||
tests: pcre2test pcre2grep
|
||||
|
||||
clean: clean-custom
|
||||
@echo "Cleaning compiler objects..."
|
||||
@rm -f $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ)
|
||||
|
||||
cleanall: clean
|
||||
@echo "Cleaning compiler targets..."
|
||||
@rm -f libpcre.a libpcre-posix.a pcre2test pcre2grep
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Targets
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2.a: $(libpcre2_OBJ)
|
||||
ar -rcs libpcre2.a $(libpcre2_OBJ)
|
||||
ranlib libpcre2.a
|
||||
|
||||
libpcre2-posix.a: $(pcre2posix_OBJ)
|
||||
ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
|
||||
ranlib libpcre2-posix.a
|
||||
|
||||
pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
|
||||
@echo "Linking pcre2test"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2 -lpcre2-posix
|
||||
@echo "Removing stale debug target: pcre2test"
|
||||
@rm -f pcre2test.debug
|
||||
|
||||
pcre2grep: libpcre2.a $(pcre2grep_OBJ)
|
||||
@echo "Linking pcre2grep"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
|
||||
@echo "Removing stale debug target: pcre2grep"
|
||||
@rm -f pcre2grep.debug
|
||||
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Standard rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# A default rule to make all the objects listed below
|
||||
# because we are hiding compiler commands from the output
|
||||
|
||||
.c.o:
|
||||
@echo "Compiling $<"
|
||||
@$(CC) -c $< -o $*.o $(CFLAGS)
|
||||
|
||||
src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
|
||||
src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
|
||||
src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
|
||||
|
||||
src/pcre2_maketables.o: src/pcre2_maketables.c
|
||||
|
||||
src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
|
||||
src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
|
||||
src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
|
||||
src/pcre2_ucd.c src/pcre2_printint.c
|
||||
|
||||
src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
|
||||
|
||||
|
||||
src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
|
||||
src/pcre2grep.o: src/pcre2grep.c src/config.h
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Custom rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
runtests: libpcre2.a libpcre2-posix.a tests
|
||||
sh RunTest
|
||||
sh RunGrepTest
|
||||
|
||||
release:
|
||||
@echo "Create release folders..."
|
||||
@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
|
||||
|
||||
@echo "Building newlib based libraries..."
|
||||
@make -f Makefile.os4 all
|
||||
@cp libpcre2.a release/local/newlib/lib/
|
||||
@cp libpcre2-posix.a release/local/newlib/lib/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@make -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Building clib2 based libraries..."
|
||||
@make -f Makefile.os4 all USE_CLIB2=yes
|
||||
@cp libpcre2.a release/local/clib2/lib/
|
||||
@cp libpcre2-posix.a release/local/clib2/lib/
|
||||
|
||||
@echo "Copy the necessary files..."
|
||||
@cp src/pcre2.h release/local/common/include/
|
||||
@cp src/pcre2posix.h release/local/common/include/
|
||||
@cp COPYING release/local/Documentation/pcre2/
|
||||
@cp HACKING release/local/Documentation/pcre2/
|
||||
@cp LICENCE release/local/Documentation/pcre2/
|
||||
@cp README release/local/Documentation/pcre2/
|
||||
@cp README-OS4.md release/local/Documentation/pcre2/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@make -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Creating the lha release file..."
|
||||
@rm -f pcre2.lha
|
||||
@lha -aeqr3 a pcre2.lha release/
|
||||
|
||||
@rm -rf release
|
||||
|
||||
###################################################################
|
||||
|
32
NEWS
32
NEWS
|
@ -2,6 +2,38 @@ News about PCRE2 releases
|
|||
-------------------------
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
This is mostly a bug-fixing and code-tidying release. However, there are some
|
||||
extensions to Unicode property handling:
|
||||
|
||||
* Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
* A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
As always, see ChangeLog for a list of all changes (also the Git log).
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (wherein which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
47
README
47
README
|
@ -8,7 +8,7 @@ features, and the internals have been improved. The original PCRE1 library is
|
|||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://github.com/PhilipHazel/pcre2/releases
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
|
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
|||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/pcre2-dev
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -114,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -188,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -369,7 +375,8 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
@ -394,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -571,9 +578,9 @@ at build time" for more details.
|
|||
Making new tarballs
|
||||
-------------------
|
||||
|
||||
The command "make dist" creates two PCRE2 tarballs, in tar.gz and zip formats.
|
||||
The command "make distcheck" does the same, but then does a trial build of the
|
||||
new distribution to ensure that it works.
|
||||
The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
|
||||
zip formats. The command "make distcheck" does the same, but then does a trial
|
||||
build of the new distribution to ensure that it works.
|
||||
|
||||
If you have modified any of the man page sources in the doc directory, you
|
||||
should first run the PrepareRelease script before making a distribution. This
|
||||
|
@ -602,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -689,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -905,4 +912,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 29 October 2021
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
PCRE2 (Perl-compatible regular expression library)
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
|
||||
GitHub repository https://github.com/PCRE2Project/pcre2
|
||||
|
||||
More information about PCRE can be found at its official website
|
||||
at https://www.pcre.org and at the documentation that comes with this
|
||||
package.
|
||||
|
||||
In the archive both newlib and clib2 libraries are included. It has been
|
||||
tested with various applications, but in case you find issues please
|
||||
contact me.
|
||||
|
||||
To install it into your AmigaOS 4 SDK installation, just extract all the
|
||||
files in the SDK: path.
|
||||
|
||||
Compile
|
||||
--------------------------
|
||||
The source and the changes I did can be found at my personale repository
|
||||
https://git.walkero.gr/walkero/pcre2
|
||||
|
||||
You can compile it using the Makefile.os4 file, and produce the libraries
|
||||
yourself.
|
||||
|
||||
* with newlib run:
|
||||
```bash
|
||||
make -f Makefile.os4 all
|
||||
```
|
||||
* with clib2 run:
|
||||
```bash
|
||||
make -f Makefile.os4 all USE_CLIB2=yes
|
||||
```
|
||||
|
||||
Changelog
|
||||
--------------------------
|
||||
v10.40r1 - 2022-07-31
|
||||
* First release
|
||||
|
|
@ -14,14 +14,14 @@ flexible API, the code of PCRE2 has been much improved since the fork.
|
|||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PhilipHazel/pcre2.git
|
||||
svn co https://github.com/PhilipHazel/pcre2.git
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
|
@ -36,7 +36,7 @@ default character encoding, can be found at
|
|||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
|
48
RunGrepTest
48
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -674,13 +690,27 @@ echo "---------------------------- Test 131 -----------------------------" >>tes
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <$srcdir/testdata/grepinput >>testtrygrep 2>&1
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -755,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# This next test involves NUL characters. It seems impossible to handle them
|
||||
# easily in many operating systems. An earlier version of this script used sed
|
||||
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character (@). However, on (some
|
||||
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
|
|
61
RunTest
61
RunTest
|
@ -17,8 +17,16 @@
|
|||
# individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
|
||||
# end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
|
||||
# runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
|
||||
# except test 10. Whatever order the arguments are in, the tests are always run
|
||||
# in numerical order.
|
||||
# except test 10. Whatever order the arguments are in, these tests are always
|
||||
# run in numerical order.
|
||||
#
|
||||
# If no specific tests are selected (which is the case when this script is run
|
||||
# via 'make check') the default is to run all the numbered tests.
|
||||
#
|
||||
# There may also be named (as well as numbered) tests for special purposes. At
|
||||
# present there is just one, called "heap". This test's output contains the
|
||||
# sizes of heap frames and frame vectors, which depend on the environment. It
|
||||
# is therefore not run unless explicitly requested.
|
||||
#
|
||||
# Inappropriate tests are automatically skipped (with a comment to say so). For
|
||||
# example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
|
||||
|
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
|||
title23="Test 23: \C disabled test"
|
||||
title24="Test 24: Non-UTF pattern conversion tests"
|
||||
title25="Test 25: UTF pattern conversion tests"
|
||||
maxtest=25
|
||||
title26="Test 26: Auto-generated unicode property tests"
|
||||
maxtest=26
|
||||
titleheap="Test 'heap': Environment-specific heap tests"
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title23
|
||||
echo $title24
|
||||
echo $title25
|
||||
echo $title26
|
||||
echo ""
|
||||
echo $titleheap
|
||||
echo ""
|
||||
echo "Numbered tests are automatically run if nothing selected."
|
||||
echo "Named tests must be explicitly selected."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -238,6 +254,8 @@ do22=no
|
|||
do23=no
|
||||
do24=no
|
||||
do25=no
|
||||
do26=no
|
||||
doheap=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
|
|||
23) do23=yes;;
|
||||
24) do24=yes;;
|
||||
25) do25=yes;;
|
||||
26) do26=yes;;
|
||||
heap) doheap=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -320,7 +340,8 @@ fi
|
|||
# set up a large stack.
|
||||
|
||||
$sim ./pcre2test -S 64 /dev/null /dev/null
|
||||
if [ $? -eq 0 -a "$bigstack" != "" ] ; then
|
||||
support_setstack=$?
|
||||
if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
|
||||
setstack="-S 64"
|
||||
else
|
||||
setstack=""
|
||||
|
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# If no specific tests were requested, select all. Those that are not
|
||||
# relevant will be automatically skipped.
|
||||
# If no specific tests were requested, select all the numbered tests. Those
|
||||
# that are not relevant will be automatically skipped.
|
||||
|
||||
if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
|
||||
|
@ -416,7 +437,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
||||
$do24 = no -a $do25 = no \
|
||||
$do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -444,6 +465,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do23=yes
|
||||
do24=yes
|
||||
do25=yes
|
||||
do26=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
echo '' >testtry
|
||||
checkspecial '-C'
|
||||
checkspecial '--help'
|
||||
if [ $support_setstack -eq 0 ] ; then
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
fi
|
||||
echo " OK"
|
||||
fi
|
||||
|
||||
|
@ -860,6 +884,29 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
fi
|
||||
|
||||
# Auto-generated unicode property tests
|
||||
|
||||
if [ $do26 = yes ] ; then
|
||||
echo $title26
|
||||
if [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
|
||||
checkresult $? 26 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Manually selected heap tests - output may vary in different environments,
|
||||
# which is why that are not automatically run.
|
||||
|
||||
if [ $doheap = yes ] ; then
|
||||
echo $titleheap
|
||||
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
|
||||
checkresult $? heap-$bits ""
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
||||
|
|
|
@ -135,9 +135,9 @@ if "%all%" == "yes" (
|
|||
set do7=yes
|
||||
set do8=yes
|
||||
set do9=yes
|
||||
set do10=yes
|
||||
set do10=no
|
||||
set do11=yes
|
||||
set do12=yes
|
||||
set do12=no
|
||||
set do13=yes
|
||||
set do14=yes
|
||||
set do15=yes
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# See MODULE.bazel
|
|
@ -1,17 +1,16 @@
|
|||
# Modified from FindReadline.cmake (PH Feb 2012)
|
||||
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
set(EDITLINE_FOUND TRUE)
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
|
||||
/usr/include/editline
|
||||
/usr/include/edit/readline
|
||||
/usr/include/readline
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
|
||||
editline
|
||||
edit/readline
|
||||
)
|
||||
|
||||
FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
|
||||
MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
|
|
42
configure.ac
42
configure.ac
|
@ -9,15 +9,15 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
|
|||
dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||
|
||||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [39])
|
||||
m4_define(pcre2_minor, [41])
|
||||
m4_define(pcre2_prerelease, [])
|
||||
m4_define(pcre2_date, [2021-10-29])
|
||||
m4_define(pcre2_date, [2022-xx-xx])
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [10:4:10])
|
||||
m4_define(libpcre2_16_version, [10:4:10])
|
||||
m4_define(libpcre2_32_version, [10:4:10])
|
||||
m4_define(libpcre2_posix_version, [3:1:0])
|
||||
m4_define(libpcre2_8_version, [11:0:11])
|
||||
m4_define(libpcre2_16_version, [11:0:11])
|
||||
m4_define(libpcre2_32_version, [11:0:11])
|
||||
m4_define(libpcre2_posix_version, [3:2:0])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
@ -512,7 +512,20 @@ AC_TYPE_SIZE_T
|
|||
|
||||
# Checks for library functions.
|
||||
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp realpath secure_getenv strerror)
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
|
||||
AC_MSG_CHECKING([for realpath])
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
]],[[
|
||||
char buffer[PATH_MAX];
|
||||
realpath(".", buffer);
|
||||
]])],
|
||||
[AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_REALPATH], 1,
|
||||
[Define to 1 if you have the `realpath' function.])
|
||||
],
|
||||
AC_MSG_RESULT([no]))
|
||||
|
||||
# Check for the availability of libz (aka zlib)
|
||||
|
||||
|
@ -584,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
|
|||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Check for the availability of libedit. Different distributions put its
|
||||
# headers in different places. Try to cover the most common ones.
|
||||
|
||||
if test "$enable_pcre2test_libedit" = "yes"; then
|
||||
AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
|
||||
AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
|
||||
HAVE_LIBEDIT_HEADER=1
|
||||
break
|
||||
])
|
||||
AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
|
||||
fi
|
||||
|
||||
|
@ -927,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
|
|||
echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
|
||||
"$HAVE_READLINE_READLINE_H" != "1"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
|
||||
echo "** nor readline/readline.h was found."
|
||||
if test -z "$HAVE_LIBEDIT_HEADER"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
|
||||
echo "** edit/readline/readline.h nor a compatible header was found."
|
||||
exit 1
|
||||
fi
|
||||
if test -z "$LIBEDIT"; then
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (wherein which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
|
@ -8,7 +8,7 @@ features, and the internals have been improved. The original PCRE1 library is
|
|||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://github.com/PhilipHazel/pcre2/releases
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
|
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
|||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/pcre2-dev
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -114,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -188,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -369,7 +375,8 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
@ -394,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -571,9 +578,9 @@ at build time" for more details.
|
|||
Making new tarballs
|
||||
-------------------
|
||||
|
||||
The command "make dist" creates two PCRE2 tarballs, in tar.gz and zip formats.
|
||||
The command "make distcheck" does the same, but then does a trial build of the
|
||||
new distribution to ensure that it works.
|
||||
The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
|
||||
zip formats. The command "make distcheck" does the same, but then does a trial
|
||||
build of the new distribution to ensure that it works.
|
||||
|
||||
If you have modified any of the man page sources in the doc directory, you
|
||||
should first run the PrepareRelease script before making a distribution. This
|
||||
|
@ -602,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -689,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -905,4 +912,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 29 October 2021
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -92,8 +92,18 @@ Additional options may be set in the compile context via the
|
|||
function.
|
||||
</P>
|
||||
<P>
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the <i>errorcode</i> argument to the the
|
||||
<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
|
||||
error was encountered is returned via the <i>erroroffset</i> argument.
|
||||
</P>
|
||||
<P>
|
||||
If there is no error, the value passed via <i>errorcode</i> returns the message
|
||||
"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
|
||||
via <i>erroroffset</i> is zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
|
|
|
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
<b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
|
||||
which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
page.
|
||||
</P>
|
||||
|
|
|
@ -48,7 +48,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA <i>number_of_codes</i> is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in <i>bytes</i>
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL <i>codes</i> or <i>bytes</i> is NULL
|
||||
</pre>
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -30,8 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{df800} to \x{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
|
||||
|
|
|
@ -69,18 +69,18 @@ The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
|||
zero-terminated strings. The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement
|
||||
(only relevant if PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
|
@ -90,7 +90,7 @@ If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
|||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-zero; its
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
|
||||
contents must be the result of a call to <b>pcre2_match()</b> using the same
|
||||
pattern and subject.
|
||||
</P>
|
||||
|
|
|
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
</P>
|
||||
<P>
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
|
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|||
limit is set, less than the default.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
<b>pcre2_match()</b> uses the heap are given in the
|
||||
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
|
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<br>
|
||||
<br>
|
||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1383,8 +1381,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and <b>pcre2_compile()</b> returns a non-NULL value.
|
||||
error has occurred.
|
||||
</P>
|
||||
<P>
|
||||
There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
|
||||
|
@ -1399,15 +1396,18 @@ because the textual error messages that are obtained by calling the
|
|||
message"
|
||||
<a href="#geterrormessage">below)</a>
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in <b>pcre2.h</b>.
|
||||
for both positive and negative error codes in <b>pcre2.h</b>. When compilation
|
||||
is successful <i>errorcode</i> is set to a value that returns the message "no
|
||||
error" if passed to <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The value returned in <i>erroroffset</i> is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
</P>
|
||||
<P>
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
|
@ -1845,7 +1845,7 @@ undefined. It may cause your program to crash or loop.
|
|||
</P>
|
||||
<P>
|
||||
Note that this option can also be passed to <b>pcre2_match()</b> and
|
||||
<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -2055,8 +2055,8 @@ point. However, this applies only to characters whose code points are less than
|
|||
\d.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode properties
|
||||
of all characters can be tested with \p and \P, or, alternatively, the
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \p and \P, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
|
@ -2640,7 +2640,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
|
|||
<i>startoffset</i>. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
|
||||
<i>length</i> is zero, the subject is assumed to be an empty string. If
|
||||
<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
If <i>startoffset</i> is greater than the length of the subject,
|
||||
|
@ -3144,11 +3146,11 @@ The backtracking match limit was reached.
|
|||
<pre>
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
</pre>
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backgracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL
|
||||
</pre>
|
||||
|
@ -3394,12 +3396,17 @@ same number causes an error at compile time.
|
|||
<P>
|
||||
This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
|
||||
subject string in <i>outputbuffer</i>, replacing parts that were matched with
|
||||
the <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
||||
option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
||||
replacement string(s). The default action is to perform just one replacement if
|
||||
the pattern matches, but there is an option that requests multiple replacements
|
||||
(see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
the <i>replacement</i> string, whose length is supplied in <b>rlength</b>, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
|
||||
replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
|
||||
error occurs if <i>replacement</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
</P>
|
||||
<P>
|
||||
If successful, <b>pcre2_substitute()</b> returns the number of substitutions
|
||||
|
@ -3433,12 +3440,12 @@ block may or may not have been changed.
|
|||
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
<i>match_data</i> block must be provided, and it must have been used for an
|
||||
external call to <b>pcre2_match()</b>. The data in the <i>match_data</i> block
|
||||
(return code, offset vector) is used for the first substitution instead of
|
||||
calling <b>pcre2_match()</b> from within <b>pcre2_substitute()</b>. This allows
|
||||
an application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
<i>match_data</i> block must be provided, and it must have already been used for
|
||||
an external call to <b>pcre2_match()</b> with the same pattern and subject
|
||||
arguments. The data in the <i>match_data</i> block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling <b>pcre2_match()</b>
|
||||
from within <b>pcre2_substitute()</b>. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
</P>
|
||||
<P>
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
|
@ -3655,7 +3662,9 @@ default.
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
<i>match_data</i> argument is NULL.
|
||||
<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
|
@ -3810,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
<P>
|
||||
The function <b>pcre2_dfa_match()</b> is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
<b>pcre2_dfa_match()</b> does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
|
||||
not support, see the
|
||||
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -3850,7 +3860,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
|
|||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
Option bits for <b>pcre_dfa_match()</b>
|
||||
Option bits for <b>pcre2_dfa_match()</b>
|
||||
</b><br>
|
||||
<P>
|
||||
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
||||
|
@ -4008,9 +4018,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \P, \p,
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
|
||||
supported. Details are given in the
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
<pre>
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
</pre>
|
||||
to the <b>configure</b> command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -608,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 March 2020
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -18,33 +18,41 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
<P>
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
</P>
|
||||
<P>
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
</P>
|
||||
<P>
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \b* (but not \b{3}, though oddly it does allow ^{3}), but these
|
||||
do not seem to have any use. PCRE2 does not allow any kind of quantifier on
|
||||
non-lookaround assertions.
|
||||
for example, \b* , but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
</P>
|
||||
<P>
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
</P>
|
||||
<P>
|
||||
4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
\U, and \N when followed by a character name. \N on its own, matching a
|
||||
non-newline character, and \N{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -55,26 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
|
|||
interprets them.
|
||||
</P>
|
||||
<P>
|
||||
5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \p and \P are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
|
||||
is limited. See the
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
</P>
|
||||
<P>
|
||||
6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \Q and \E which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \Q and \E just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \Q
|
||||
and \E which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \Q and \E just like any other character. Note the
|
||||
following examples:
|
||||
<pre>
|
||||
Pattern PCRE2 matches Perl matches
|
||||
|
||||
|
@ -88,19 +96,19 @@ The \Q...\E sequence is recognized both inside and outside character classes
|
|||
by both PCRE2 and Perl.
|
||||
</P>
|
||||
<P>
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
</P>
|
||||
<P>
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
|
@ -109,20 +117,20 @@ the group does not contain any | characters. Note that such groups are
|
|||
processed as anchored at the point where they are tested.
|
||||
</P>
|
||||
<P>
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
</P>
|
||||
<P>
|
||||
11. There are some differences that are concerned with the settings of captured
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
"b".
|
||||
</P>
|
||||
<P>
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact the PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
|
@ -132,42 +140,43 @@ to distinguish which group matched, because both names map to capture group
|
|||
number 1. To avoid this confusing situation, an error is given at compile time.
|
||||
</P>
|
||||
<P>
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a group. If the /x modifier is
|
||||
set, Perl allowed white space between ( and ? though the latest Perls give an
|
||||
error (for a while it was just deprecated). There may still be some cases where
|
||||
Perl behaves differently.
|
||||
</P>
|
||||
<P>
|
||||
14. Perl, when in warning mode, gives warnings for character classes such as
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
[A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||
warning features, so it gives an error in these cases because they are almost
|
||||
certainly user mistakes.
|
||||
</P>
|
||||
<P>
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \p{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.32), \p{Lu} and \p{Ll} match all
|
||||
in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
</P>
|
||||
<P>
|
||||
16. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
17. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\K is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
</P>
|
||||
<P>
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.32:
|
||||
list is with respect to Perl 5.34:
|
||||
<br>
|
||||
<br>
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same length.
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
<br>
|
||||
<br>
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
|
@ -221,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible
|
|||
lookarounds are atomic.
|
||||
</P>
|
||||
<P>
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
</P>
|
||||
<P>
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
<a href="pcre2limit.html"><b>pcre2limit</b></a>
|
||||
documentation for details. Perl went with 5.10 from recursion to iteration
|
||||
keeping the intermediate matches on the heap, which is ~10% slower but does not
|
||||
|
@ -248,7 +257,7 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 08 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
|
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
|
|||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
|
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate files names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -1053,9 +1066,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 31 August 2021
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -269,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
|
|||
for currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
</P>
|
||||
<P>
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
|
@ -382,8 +382,8 @@ out this complicated API.
|
|||
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -442,10 +442,10 @@ that was not compiled.
|
|||
<P>
|
||||
When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
</P>
|
||||
<P>
|
||||
Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
|
||||
|
@ -466,9 +466,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 30 November 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -534,7 +534,7 @@ for themselves. For example, outside a character class:
|
|||
\0113 is a tab followed by the character "3"
|
||||
\113 might be a backreference, otherwise the character with octal code 113
|
||||
\377 might be a backreference, otherwise the value 255 (decimal)
|
||||
\81 is always a backreference .sp
|
||||
\81 is always a backreference
|
||||
</pre>
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
must not be introduced by a leading zero, because no more than three octal
|
||||
|
@ -776,199 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
</P>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
</P>
|
||||
<P>
|
||||
The extra escape sequences that provide property support are:
|
||||
<pre>
|
||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The property names represented by <i>xx</i> above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
<a href="#extraprops">next section).</a>
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \P{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
The property names represented by <i>xx</i> above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
<a href="#extraprops">below).</a>
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \P{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
</P>
|
||||
<br><b>
|
||||
Script properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\p{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and a equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
</P>
|
||||
<P>
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
<pre>
|
||||
\p{Greek}
|
||||
\P{Han}
|
||||
</pre>
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The general category property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
|
@ -1030,9 +893,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
</pre>
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
</P>
|
||||
<P>
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
|
@ -1059,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
|
|||
example, \p{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
</P>
|
||||
<br><b>
|
||||
Binary (yes/no) properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The Bidi_Class property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
</pre>
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
</P>
|
||||
<br><b>
|
||||
Extended grapheme clusters
|
||||
|
@ -1341,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
<P>
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
<a href="#newlines">"Newline conventions"</a>
|
||||
above).
|
||||
</P>
|
||||
<P>
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurences
|
||||
of CR of LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
</P>
|
||||
<P>
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
|
@ -2180,10 +2087,10 @@ be easier to remember:
|
|||
<pre>
|
||||
(*atomic:\d+)foo
|
||||
</pre>
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
</P>
|
||||
<P>
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
|
@ -3859,9 +3766,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||
<P>
|
||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
||||
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
<pre>
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
</pre>
|
||||
|
@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
|
|||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
<pre>
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
|
||||
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
|
||||
<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
|
||||
<li><a name="TOC13" href="#SEC13">CAPTURING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC15" href="#SEC15">COMMENT</a>
|
||||
<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
|
||||
<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC20" href="#SEC20">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
|
||||
<li><a name="TOC22" href="#SEC22">BACKREFERENCES</a>
|
||||
<li><a name="TOC23" href="#SEC23">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC24" href="#SEC24">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC25" href="#SEC25">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC26" href="#SEC26">CALLOUTS</a>
|
||||
<li><a name="TOC27" href="#SEC27">SEE ALSO</a>
|
||||
<li><a name="TOC28" href="#SEC28">AUTHOR</a>
|
||||
<li><a name="TOC29" href="#SEC29">REVISION</a>
|
||||
<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
|
||||
<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
|
||||
<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
|
||||
<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
|
||||
<li><a name="TOC15" href="#SEC15">CAPTURING</a>
|
||||
<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMMENT</a>
|
||||
<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
|
||||
<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
|
||||
<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
|
||||
<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
|
||||
<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
|
||||
<li><a name="TOC30" href="#SEC30">AUTHOR</a>
|
||||
<li><a name="TOC31" href="#SEC31">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
|
||||
<P>
|
||||
|
@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
|
|||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
</P>
|
||||
<P>
|
||||
Property descriptions in \p and \P are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
|
@ -152,6 +159,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
|
@ -198,171 +206,58 @@ characters.
|
|||
Perl and POSIX space are now the same. Perl added VT to its space character set
|
||||
at release 5.18.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||
<P>
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
[...] positive character class
|
||||
|
@ -390,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
|
|||
but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
||||
\Q...\E inside a character class.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
? 0 or 1, greedy
|
||||
|
@ -411,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
{n,}? n or more, lazy
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\b word boundary
|
||||
|
@ -429,7 +324,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
\G first matching position in subject
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\K set reported start of match
|
||||
|
@ -439,13 +334,13 @@ for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
|||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\K is honoured in positive assertions, but ignored in negative ones.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
expr|expr|expr...
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(...) capture group
|
||||
|
@ -460,20 +355,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
|
|||
in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
|
||||
both cases, a name must not start with a digit.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?>...) atomic non-capture group
|
||||
(*atomic:...) atomic non-capture group
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?#....) comment (not nestable)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
|
||||
<P>
|
||||
Changes of these options within a group are automatically cancelled at the end
|
||||
of the group.
|
||||
|
@ -518,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
|
|||
application can lock out the use of (*UTF) and (*UCP) by setting the
|
||||
PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
settings with a similar syntax.
|
||||
|
@ -531,7 +426,7 @@ settings with a similar syntax.
|
|||
(*NUL) the NUL character (binary zero)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
setting with a similar syntax.
|
||||
|
@ -540,7 +435,7 @@ setting with a similar syntax.
|
|||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?=...) )
|
||||
|
@ -561,7 +456,7 @@ setting with a similar syntax.
|
|||
</pre>
|
||||
Each top-level branch of a lookbehind must be of a fixed length.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<P>
|
||||
These assertions are specific to PCRE2 and are not Perl-compatible.
|
||||
<pre>
|
||||
|
@ -574,7 +469,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*non_atomic_positive_lookbehind:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(*script_run:...) ) script run, can be backtracked into
|
||||
|
@ -584,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*asr:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\n reference by number (can be ambiguous)
|
||||
|
@ -601,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(?P=name) reference by name (Python)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?R) recurse whole pattern
|
||||
|
@ -620,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
\g'-n' call subroutine by relative number (PCRE2 extension)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?(condition)yes-pattern)
|
||||
|
@ -643,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
|||
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||
condition if the relevant named group exists.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
|
||||
name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
|
||||
|
@ -670,7 +565,7 @@ pattern is not anchored.
|
|||
The effect of one of these verbs in a group called as a subroutine is confined
|
||||
to the subroutine call.
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">CALLOUTS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout (assumed number 0)
|
||||
|
@ -681,12 +576,12 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
|||
start and the end), and the starting delimiter { matched with the ending
|
||||
delimiter }. To encode the ending delimiter within the string, double it.
|
||||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC28" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -695,11 +590,11 @@ Retired from University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC29" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -78,7 +78,7 @@ to 8-bit code units for output.
|
|||
</P>
|
||||
<P>
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, <b>pcre_compile()</b>. The actual
|
||||
are given in generic form, for example, <b>pcre2_compile()</b>. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
<a name="inputencoding"></a></P>
|
||||
<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
|
||||
|
@ -253,7 +253,19 @@ available, and the use of JIT for matching is verified.
|
|||
<b>-LM</b>
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LP</b>
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LS</b>
|
||||
List scripts: write a list of recogized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-pattern</b> <i>modifier-list</i>
|
||||
|
@ -1229,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1239,6 +1252,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1550,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1560,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1589,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1609,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace that the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1668,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
|
@ -1676,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
<b>null_replacement</b> modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
|
@ -2122,9 +2143,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
|
|||
<P>
|
||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||
\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
and
|
||||
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -477,7 +478,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -486,9 +487,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 23 February 2020
|
||||
Last updated: 22 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
572
doc/pcre2.txt
572
doc/pcre2.txt
|
@ -1028,7 +1028,7 @@ PCRE2 CONTEXTS
|
|||
pcre2jit documentation for more details). If the limit is reached, the
|
||||
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
|
||||
limit can be set when PCRE2 is built; if it is not, the default is set
|
||||
very large and is essentially "unlimited".
|
||||
very large and is essentially unlimited.
|
||||
|
||||
A value for the heap limit may also be supplied by an item at the start
|
||||
of a pattern of the form
|
||||
|
@ -1039,19 +1039,15 @@ PCRE2 CONTEXTS
|
|||
less ddd is less than the limit set by the caller of pcre2_match() or,
|
||||
if no such limit is set, less than the default.
|
||||
|
||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
||||
tem stack for recording backtracking points. The more nested backtrack-
|
||||
ing points there are (that is, the deeper the search tree), the more
|
||||
memory is needed. Heap memory is used only if the initial vector is
|
||||
too small. If the heap limit is set to a value less than 21 (in partic-
|
||||
ular, zero) no heap memory will be used. In this case, only patterns
|
||||
that do not have a lot of nested backtracking can be successfully pro-
|
||||
cessed.
|
||||
The pcre2_match() function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
pcre2_match() uses the heap are given in the pcre2perform documenta-
|
||||
tion.
|
||||
|
||||
Similarly, for pcre2_dfa_match(), a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and
|
||||
only if this is not big enough is heap memory used. In this case, too,
|
||||
setting a value of zero disables the use of the heap.
|
||||
For pcre2_dfa_match(), a vector on the system stack is used when pro-
|
||||
cessing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, setting a
|
||||
value of zero disables the use of the heap.
|
||||
|
||||
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
||||
uint32_t value);
|
||||
|
@ -1093,12 +1089,12 @@ PCRE2 CONTEXTS
|
|||
|
||||
This parameter limits the depth of nested backtracking in
|
||||
pcre2_match(). Each time a nested backtracking point is passed, a new
|
||||
memory "frame" is used to remember the state of matching at that point.
|
||||
memory frame is used to remember the state of matching at that point.
|
||||
Thus, this parameter indirectly limits the amount of memory that is
|
||||
used in a match. However, because the size of each memory "frame" de-
|
||||
pends on the number of capturing parentheses, the actual memory limit
|
||||
varies from pattern to pattern. This limit was more useful in versions
|
||||
before 10.30, where function recursion was used for backtracking.
|
||||
used in a match. However, because the size of each memory frame depends
|
||||
on the number of capturing parentheses, the actual memory limit varies
|
||||
from pattern to pattern. This limit was more useful in versions before
|
||||
10.30, where function recursion was used for backtracking.
|
||||
|
||||
The depth limit is not relevant, and is ignored, when matching is done
|
||||
using JIT compiled code. However, it is supported by pcre2_dfa_match(),
|
||||
|
@ -1372,8 +1368,7 @@ COMPILING A PATTERN
|
|||
diately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern, re-
|
||||
spectively, when pcre2_compile() returns NULL because a compilation er-
|
||||
ror has occurred. The values are not defined when compilation is suc-
|
||||
cessful and pcre2_compile() returns a non-NULL value.
|
||||
ror has occurred.
|
||||
|
||||
There are nearly 100 positive error codes that pcre2_compile() may re-
|
||||
turn if it finds an error in the pattern. There are also some negative
|
||||
|
@ -1385,14 +1380,17 @@ COMPILING A PATTERN
|
|||
pcre2_get_error_message() function (see "Obtaining a textual error mes-
|
||||
sage" below) should be self-explanatory. Macro names starting with
|
||||
PCRE2_ERROR_ are defined for both positive and negative error codes in
|
||||
pcre2.h.
|
||||
pcre2.h. When compilation is successful errorcode is set to a value
|
||||
that returns the message "no error" if passed to pcre2_get_error_mes-
|
||||
sage().
|
||||
|
||||
The value returned in erroroffset is an indication of where in the pat-
|
||||
tern the error occurred. It is not necessarily the furthest point in
|
||||
the pattern that was read. For example, after the error "lookbehind as-
|
||||
sertion is not fixed length", the error offset points to the start of
|
||||
the failing assertion. For an invalid UTF-8 or UTF-16 string, the off-
|
||||
set is that of the first code unit of the failing character.
|
||||
tern an error occurred. When there is no error, zero is returned. A
|
||||
non-zero value is not necessarily the furthest point in the pattern
|
||||
that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of
|
||||
the first code unit of the failing character.
|
||||
|
||||
Some errors are not detected until the whole pattern has been scanned;
|
||||
in these cases, the offset passed back is the length of the pattern.
|
||||
|
@ -1815,7 +1813,7 @@ COMPILING A PATTERN
|
|||
to crash or loop.
|
||||
|
||||
Note that this option can also be passed to pcre2_match() and
|
||||
pcre_dfa_match(), to suppress UTF validity checking of the subject
|
||||
pcre2_dfa_match(), to suppress UTF validity checking of the subject
|
||||
string.
|
||||
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis-
|
||||
|
@ -2012,13 +2010,13 @@ LOCALE SUPPORT
|
|||
code points are less than 256. By default, higher-valued code points
|
||||
never match escapes such as \w or \d.
|
||||
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode
|
||||
properties of all characters can be tested with \p and \P, or, alterna-
|
||||
tively, the PCRE2_UCP option can be set when a pattern is compiled;
|
||||
this causes \w and friends to use Unicode property support instead of
|
||||
the built-in tables. PCRE2_UCP also causes upper/lower casing opera-
|
||||
tions on characters with code points greater than 127 to use Unicode
|
||||
properties. These effects apply even when PCRE2_UTF is not set.
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \p and \P, or, alternatively,
|
||||
the PCRE2_UCP option can be set when a pattern is compiled; this causes
|
||||
\w and friends to use Unicode property support instead of the built-in
|
||||
tables. PCRE2_UCP also causes upper/lower casing operations on charac-
|
||||
ters with code points greater than 127 to use Unicode properties. These
|
||||
effects apply even when PCRE2_UTF is not set.
|
||||
|
||||
The use of locales with Unicode is discouraged. If you are handling
|
||||
characters with code points greater than 127, you should either use
|
||||
|
@ -2579,7 +2577,9 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
|||
and offset are in code units, not characters. That is, they are in
|
||||
bytes for the 8-bit library, 16-bit code units for the 16-bit library,
|
||||
and 32-bit code units for the 32-bit library, whether or not UTF pro-
|
||||
cessing is enabled.
|
||||
cessing is enabled. As a special case, if subject is NULL and length is
|
||||
zero, the subject is assumed to be an empty string. If length is non-
|
||||
zero, an error occurs if subject is NULL.
|
||||
|
||||
If startoffset is greater than the length of the subject, pcre2_match()
|
||||
returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the
|
||||
|
@ -3047,12 +3047,12 @@ ERROR RETURNS FROM pcre2_match()
|
|||
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
|
||||
If a pattern contains many nested backtracking points, heap memory is
|
||||
used to remember them. This error is given when the memory allocation
|
||||
function (default or custom) fails. Note that a different error,
|
||||
PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backgracking points. This error is
|
||||
given when the memory allocation function (default or custom) fails.
|
||||
Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the
|
||||
amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca-
|
||||
tion fails.
|
||||
|
||||
PCRE2_ERROR_NULL
|
||||
|
||||
|
@ -3280,8 +3280,12 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
|
||||
This function optionally calls pcre2_match() and then makes a copy of
|
||||
the subject string in outputbuffer, replacing parts that were matched
|
||||
with the replacement string, whose length is supplied in rlength. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
with the replacement string, whose length is supplied in rlength, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As
|
||||
a special case, if replacement is NULL and rlength is zero, the re-
|
||||
placement is assumed to be an empty string. If rlength is non-zero, an
|
||||
error occurs if replacement is NULL.
|
||||
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to re-
|
||||
turn just the replacement string(s). The default action is to perform
|
||||
just one replacement if the pattern matches, but there is an option
|
||||
|
@ -3315,12 +3319,13 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
As well as the usual options for pcre2_match(), a number of additional
|
||||
options can be set in the options argument of pcre2_substitute(). One
|
||||
such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
match_data block must be provided, and it must have been used for an
|
||||
external call to pcre2_match(). The data in the match_data block (re-
|
||||
turn code, offset vector) is used for the first substitution instead of
|
||||
calling pcre2_match() from within pcre2_substitute(). This allows an
|
||||
application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
match_data block must be provided, and it must have already been used
|
||||
for an external call to pcre2_match() with the same pattern and subject
|
||||
arguments. The data in the match_data block (return code, offset vec-
|
||||
tor) is then used for the first substitution instead of calling
|
||||
pcre2_match() from within pcre2_substitute(). This allows an applica-
|
||||
tion to check for a match before choosing to substitute, without having
|
||||
to repeat the match.
|
||||
|
||||
The contents of the externally supplied match data block are not
|
||||
changed when PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTI-
|
||||
|
@ -3524,7 +3529,9 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
does not happen by default.
|
||||
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
match_data argument is NULL.
|
||||
match_data argument is NULL or if the subject or replacement arguments
|
||||
are NULL. For backward compatibility reasons an exception is made for
|
||||
the replacement argument if the rlength argument is also 0.
|
||||
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
|
||||
the replacement string, with more particular errors being PCRE2_ER-
|
||||
|
@ -3664,12 +3671,13 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
The function pcre2_dfa_match() is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the
|
||||
subject string just once (not counting lookaround assertions), and does
|
||||
not backtrack. This has different characteristics to the normal algo-
|
||||
rithm, and is not compatible with Perl. Some of the features of PCRE2
|
||||
patterns are not supported. Nevertheless, there are times when this
|
||||
kind of matching can be useful. For a discussion of the two matching
|
||||
algorithms, and a list of features that pcre2_dfa_match() does not sup-
|
||||
port, see the pcre2matching documentation.
|
||||
not backtrack (except when processing lookaround assertions). This has
|
||||
different characteristics to the normal algorithm, and is not compati-
|
||||
ble with Perl. Some of the features of PCRE2 patterns are not sup-
|
||||
ported. Nevertheless, there are times when this kind of matching can be
|
||||
useful. For a discussion of the two matching algorithms, and a list of
|
||||
features that pcre2_dfa_match() does not support, see the pcre2matching
|
||||
documentation.
|
||||
|
||||
The arguments for the pcre2_dfa_match() function are the same as for
|
||||
pcre2_match(), plus two extras. The ovector within the match data block
|
||||
|
@ -3698,7 +3706,7 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
wspace, /* working space vector */
|
||||
20); /* number of elements (NOT size in bytes) */
|
||||
|
||||
Option bits for pcre_dfa_match()
|
||||
Option bits for pcre2_dfa_match()
|
||||
|
||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED,
|
||||
|
@ -3848,8 +3856,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -3961,8 +3969,8 @@ UNICODE AND UTF SUPPORT
|
|||
0x10ffff in the strings that they handle. Unicode support also gives
|
||||
access to the Unicode properties of characters, using pattern escapes
|
||||
such as \P, \p, and \X. Only the general category properties such as Lu
|
||||
and Nd are supported. Details are given in the pcre2pattern documenta-
|
||||
tion.
|
||||
and Nd, script names, and some bi-directional properties are supported.
|
||||
Details are given in the pcre2pattern documentation.
|
||||
|
||||
Pattern escapes such as \d and \w do not by default make use of Unicode
|
||||
properties. The application can request that they do by setting the
|
||||
|
@ -4106,14 +4114,13 @@ LIMITING PCRE2 RESOURCE USAGE
|
|||
pcre2_dfa_match() matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
|
||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
||||
tem stack to record backtracking points. The more nested backtracking
|
||||
points there are (that is, the deeper the search tree), the more memory
|
||||
is needed. If the initial vector is not large enough, heap memory is
|
||||
used, up to a certain limit, which is specified in kibibytes (units of
|
||||
1024 bytes). The limit can be changed at run time, as described in the
|
||||
pcre2api documentation. The default limit (in effect unlimited) is 20
|
||||
million. You can change this by a setting such as
|
||||
The pcre2_match() function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the
|
||||
deeper the search tree), the more memory is needed. There is an upper
|
||||
limit, specified in kibibytes (units of 1024 bytes). This limit can be
|
||||
changed at run time, as described in the pcre2api documentation. The
|
||||
default limit (in effect unlimited) is 20 million. You can change this
|
||||
by a setting such as
|
||||
|
||||
--with-heap-limit=500
|
||||
|
||||
|
@ -4128,7 +4135,7 @@ LIMITING PCRE2 RESOURCE USAGE
|
|||
for --with-match-limit. You can set a lower default limit by adding,
|
||||
for example,
|
||||
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
|
||||
to the configure command. This value can be overridden at run time.
|
||||
This depth limit indirectly limits the amount of heap memory that is
|
||||
|
@ -4438,14 +4445,14 @@ SEE ALSO
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 20 March 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -4890,29 +4897,36 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
|
||||
This document describes some of the differences in the ways that PCRE2
|
||||
and Perl handle regular expressions. The differences described here are
|
||||
with respect to Perl version 5.32.0, but as both Perl and PCRE2 are
|
||||
with respect to Perl version 5.34.0, but as both Perl and PCRE2 are
|
||||
continually changing, the information may at times be out of date.
|
||||
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set,
|
||||
the behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.'
|
||||
matches the next character unless it is the start of a newline se-
|
||||
quence. This means that, if the newline setting is CR, CRLF, or NUL,
|
||||
'.' will match the code point LF (0x0A) in ASCII/Unicode environments,
|
||||
and NL (either 0x15 or 0x25) when using EBCDIC. In Perl, '.' appears
|
||||
never to match LF, even when 0x0A is not a newline indicator.
|
||||
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what
|
||||
it does have are given in the pcre2unicode page.
|
||||
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized asser-
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized asser-
|
||||
tions, but they do not mean what you might think. For example, (?!a){3}
|
||||
does not assert that the next three characters are not "a". It just as-
|
||||
serts that the next character is not "a" three times (in principle;
|
||||
PCRE2 optimizes this to run the assertion just once). Perl allows some
|
||||
repeat quantifiers on other assertions, for example, \b* (but not
|
||||
\b{3}, though oddly it does allow ^{3}), but these do not seem to have
|
||||
any use. PCRE2 does not allow any kind of quantifier on non-lookaround
|
||||
assertions.
|
||||
repeat quantifiers on other assertions, for example, \b* , but these do
|
||||
not seem to have any use. PCRE2 does not allow any kind of quantifier
|
||||
on non-lookaround assertions.
|
||||
|
||||
3. Capture groups that occur inside negative lookaround assertions are
|
||||
4. Capture groups that occur inside negative lookaround assertions are
|
||||
counted, but their entries in the offsets vector are set only when a
|
||||
negative assertion is a condition that has a matching branch (that is,
|
||||
the condition is false). Perl may set such capture groups in other
|
||||
circumstances.
|
||||
|
||||
4. The following Perl escape sequences are not supported: \F, \l, \L,
|
||||
5. The following Perl escape sequences are not supported: \F, \l, \L,
|
||||
\u, \U, and \N when followed by a character name. \N on its own, match-
|
||||
ing a non-newline character, and \N{U+dd..}, matching a Unicode code
|
||||
point, are supported. The escapes that modify the case of following
|
||||
|
@ -4922,25 +4936,25 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are
|
||||
interpreted as ECMAScript interprets them.
|
||||
|
||||
5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2
|
||||
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2
|
||||
is built with Unicode support (the default). The properties that can be
|
||||
tested with \p and \P are limited to the general category properties
|
||||
such as Lu and Nd, script names such as Greek or Han, and the derived
|
||||
properties Any and L&. Both PCRE2 and Perl support the Cs (surrogate)
|
||||
property, but in PCRE2 its use is limited. See the pcre2pattern docu-
|
||||
mentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \p{Letter}) are not supported by PCRE2, nor is it
|
||||
permitted to prefix any of these properties with "Is".
|
||||
such as Lu and Nd, script names such as Greek or Han, Bidi_Class,
|
||||
Bidi_Control, and the derived properties Any and LC (synonym L&). Both
|
||||
PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its
|
||||
use is limited. See the pcre2pattern documentation for details. The
|
||||
long synonyms for property names that Perl supports (such as \p{Let-
|
||||
ter}) are not supported by PCRE2, nor is it permitted to prefix any of
|
||||
these properties with "Is".
|
||||
|
||||
6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different
|
||||
from Perl in that $ and @ are also handled as literals inside the
|
||||
quotes. In Perl, they cause variable interpolation (but of course PCRE2
|
||||
does not have variables). Also, Perl does "double-quotish backslash in-
|
||||
terpolation" on any backslashes between \Q and \E which, its documenta-
|
||||
tion says, "may lead to confusing results". PCRE2 treats a backslash
|
||||
between \Q and \E just like any other character. Note the following ex-
|
||||
amples:
|
||||
quotes. In Perl, they cause variable interpolation (PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on
|
||||
any backslashes between \Q and \E which, its documentation says, "may
|
||||
lead to confusing results". PCRE2 treats a backslash between \Q and \E
|
||||
just like any other character. Note the following examples:
|
||||
|
||||
Pattern PCRE2 matches Perl matches
|
||||
|
||||
|
@ -4954,16 +4968,16 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
The \Q...\E sequence is recognized both inside and outside character
|
||||
classes by both PCRE2 and Perl.
|
||||
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and
|
||||
(??{code}) constructions. However, PCRE2 does have a "callout" feature,
|
||||
which allows an external function to be called during pattern matching.
|
||||
See the pcre2callout documentation for details.
|
||||
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic
|
||||
groups up to PCRE2 release 10.23, but from release 10.30 this changed,
|
||||
and backtracking into subroutine calls is now supported, as in Perl.
|
||||
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a
|
||||
group that is called as a subroutine (whether or not recursively),
|
||||
their effect is confined to that group; it does not extend to the sur-
|
||||
rounding pattern. This is not always the case in Perl. In particular,
|
||||
|
@ -4972,18 +4986,18 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
| characters. Note that such groups are processed as anchored at the
|
||||
point where they are tested.
|
||||
|
||||
10. If a pattern contains more than one backtracking control verb, the
|
||||
11. If a pattern contains more than one backtracking control verb, the
|
||||
first one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
||||
in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases
|
||||
it is the same as PCRE2, but there are cases where it differs.
|
||||
|
||||
11. There are some differences that are concerned with the settings of
|
||||
12. There are some differences that are concerned with the settings of
|
||||
captured strings when part of a pattern is repeated. For example,
|
||||
matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un-
|
||||
set, but in PCRE2 it is set to "b".
|
||||
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is
|
||||
not as general as Perl's. This is a consequence of the fact the PCRE2
|
||||
works internally just with numbers, using an external table to trans-
|
||||
late between numbers and names. In particular, a pattern such as
|
||||
|
@ -4993,39 +5007,40 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
group matched, because both names map to capture group number 1. To
|
||||
avoid this confusing situation, an error is given at compile time.
|
||||
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not,
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not,
|
||||
for example, between the ( and ? at the start of a group. If the /x
|
||||
modifier is set, Perl allowed white space between ( and ? though the
|
||||
latest Perls give an error (for a while it was just deprecated). There
|
||||
may still be some cases where Perl behaves differently.
|
||||
|
||||
14. Perl, when in warning mode, gives warnings for character classes
|
||||
15. Perl, when in warning mode, gives warnings for character classes
|
||||
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
||||
als. PCRE2 has no warning features, so it gives an error in these cases
|
||||
because they are almost certainly user mistakes.
|
||||
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are
|
||||
not affected when case-independent matching is specified. For example,
|
||||
\p{Lu} always matches an upper case letter. I think Perl has changed in
|
||||
this respect; in the release at the time of writing (5.32), \p{Lu} and
|
||||
this respect; in the release at the time of writing (5.34), \p{Lu} and
|
||||
\p{Ll} match all letters, regardless of case, when case independence is
|
||||
specified.
|
||||
|
||||
16. From release 5.32.0, Perl locks out the use of \K in lookaround as-
|
||||
17. From release 5.32.0, Perl locks out the use of \K in lookaround as-
|
||||
sertions. From release 10.38 PCRE2 does the same by default. However,
|
||||
there is an option for re-enabling the previous behaviour. When this
|
||||
option is set, \K is acted on when it occurs in positive assertions,
|
||||
but is ignored in negative assertions.
|
||||
|
||||
17. PCRE2 provides some extensions to the Perl regular expression fa-
|
||||
18. PCRE2 provides some extensions to the Perl regular expression fa-
|
||||
cilities. Perl 5.10 included new features that were not in earlier
|
||||
versions of Perl, some of which (such as named parentheses) were in
|
||||
PCRE2 for some time before. This list is with respect to Perl 5.32:
|
||||
PCRE2 for some time before. This list is with respect to Perl 5.34:
|
||||
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length
|
||||
strings, each alternative toplevel branch of a lookbehind assertion can
|
||||
match a different length of string. Perl requires them all to have the
|
||||
same length.
|
||||
match a different length of string. Perl used to require them all to
|
||||
have the same length, but the latest version has some variable length
|
||||
support.
|
||||
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are sup-
|
||||
ported in lookbehinds, provided that there is no possibility of refer-
|
||||
|
@ -5067,12 +5082,12 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
an extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the
|
||||
19. The Perl /a modifier restricts /d numbers to pure ascii, and the
|
||||
/aa modifier restricts /i case-insensitive matching to pure ascii, ig-
|
||||
noring Unicode rules. This separation cannot be represented with
|
||||
PCRE2_UCP.
|
||||
|
||||
19. Perl has different limits than PCRE2. See the pcre2limit documenta-
|
||||
20. Perl has different limits than PCRE2. See the pcre2limit documenta-
|
||||
tion for details. Perl went with 5.10 from recursion to iteration keep-
|
||||
ing the intermediate matches on the heap, which is ~10% slower but does
|
||||
not fall into any stack-overflow limit. PCRE2 made a similar change at
|
||||
|
@ -5089,7 +5104,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -5492,10 +5507,11 @@ JIT FAST PATH API
|
|||
|
||||
When you call pcre2_match(), as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For exam-
|
||||
ple, if the subject pointer is NULL, an immediate error is given. Also,
|
||||
unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for
|
||||
validity. In the interests of speed, these checks do not happen on the
|
||||
JIT fast path, and if invalid data is passed, the result is undefined.
|
||||
ple, if the subject pointer is NULL but the length is non-zero, an im-
|
||||
mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF
|
||||
subject string is tested for validity. In the interests of speed, these
|
||||
checks do not happen on the JIT fast path, and if invalid data is
|
||||
passed, the result is undefined.
|
||||
|
||||
Bypassing the sanity checks and the pcre2_match() wrapping can give
|
||||
speedups of more than 10%.
|
||||
|
@ -5515,8 +5531,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 November 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -5575,18 +5591,22 @@ SIZE AND OTHER LIMITATIONS
|
|||
The maximum length of a string argument to a callout is the largest
|
||||
number a 32-bit unsigned integer can hold.
|
||||
|
||||
The maximum amount of heap memory used for matching is controlled by
|
||||
the heap limit, which can be set in a pattern or in a match context.
|
||||
The default is a very large number, effectively unlimited.
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -6870,58 +6890,55 @@ BACKSLASH
|
|||
ters whose code points are less than U+0100 and U+10000, respectively.
|
||||
In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode
|
||||
limit) may be encountered. These are all treated as being in the Un-
|
||||
known script and with an unassigned type. The extra escape sequences
|
||||
are:
|
||||
known script and with an unassigned type.
|
||||
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has
|
||||
to do a multistage table lookup in order to find a character's prop-
|
||||
erty. That is why the traditional escape sequences such as \d and \w do
|
||||
not use Unicode properties in PCRE2 by default, though you can make
|
||||
them do so by setting the PCRE2_UCP option or by starting the pattern
|
||||
with (*UCP).
|
||||
|
||||
The extra escape sequences that provide property support are:
|
||||
|
||||
\p{xx} a character with the xx property
|
||||
\P{xx} a character without the xx property
|
||||
\X a Unicode extended grapheme cluster
|
||||
|
||||
The property names represented by xx above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties,
|
||||
"Any", which matches any character (including newline), and some spe-
|
||||
cial PCRE2 properties (described in the next section). Other Perl
|
||||
properties such as "InMusicalSymbols" are not supported by PCRE2. Note
|
||||
that \P{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
The property names represented by xx above are not case-sensitive, and
|
||||
in accordance with Unicode's "loose matching" rules, spaces, hyphens,
|
||||
and underscores are ignored. There is support for Unicode script names,
|
||||
Unicode general category properties, "Any", which matches any character
|
||||
(including newline), Bidi_Class, a number of binary (yes/no) proper-
|
||||
ties, and some special PCRE2 properties (described below). Certain
|
||||
other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \P{Any} does not match any characters, so always
|
||||
causes a match failure.
|
||||
|
||||
Sets of Unicode characters are defined as belonging to certain scripts.
|
||||
A character from one of these sets can be matched using a script name.
|
||||
For example:
|
||||
Script properties for \p and \P
|
||||
|
||||
\p{Greek}
|
||||
\P{Han}
|
||||
There are three different syntax forms for matching a script. Each Uni-
|
||||
code character has a basic script and, optionally, a list of other
|
||||
scripts ("Script Extensions") with which it is commonly used. Using the
|
||||
Adlam script as an example, \p{sc:Adlam} matches characters whose basic
|
||||
script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters
|
||||
that have Adlam in their extensions list. The full names "script" and
|
||||
"script extensions" for the property types are recognized, and a equals
|
||||
sign is an alternative to the colon. If a script name is given without
|
||||
a property type, for example, \p{Adlam}, it is treated as \p{scx:Ad-
|
||||
lam}. Perl changed to this interpretation at release 5.26 and PCRE2
|
||||
changed at release 10.40.
|
||||
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code
|
||||
points greater than 0x10FFFF) are assigned the "Unknown" script. Others
|
||||
that are not part of an identified script are lumped together as "Com-
|
||||
mon". The current list of scripts is:
|
||||
mon". The current list of recognized script names and their 4-character
|
||||
abbreviations can be obtained by running this command:
|
||||
|
||||
Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali-
|
||||
nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi,
|
||||
Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba-
|
||||
nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform,
|
||||
Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do-
|
||||
gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor-
|
||||
gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur-
|
||||
mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana,
|
||||
Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
||||
tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
||||
Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin,
|
||||
Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani,
|
||||
Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede-
|
||||
faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero-
|
||||
glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi-
|
||||
nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham,
|
||||
Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
|
||||
Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
|
||||
Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa,
|
||||
Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra,
|
||||
Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
|
||||
Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le,
|
||||
Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai,
|
||||
Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Unknown, Vai, Vithkuqi,
|
||||
Wancho, Warang_Citi, Yezidi, Yi, Zanabazar_Square.
|
||||
pcre2test -LS
|
||||
|
||||
|
||||
The general category property for \p and \P
|
||||
|
||||
Each character has exactly one Unicode general category property, spec-
|
||||
ified by a two-letter abbreviation. For compatibility with Perl, nega-
|
||||
|
@ -6983,9 +7000,9 @@ BACKSLASH
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
|
||||
The special property L& is also supported: it matches a character that
|
||||
has the Lu, Ll, or Lt property, in other words, a letter that is not
|
||||
classified as a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported:
|
||||
it matches a character that has the Lu, Ll, or Lt property, in other
|
||||
words, a letter that is not classified as a modifier or "other".
|
||||
|
||||
The Cs (Surrogate) property applies only to characters whose code
|
||||
points are in the range U+D800 to U+DFFF. These characters are no dif-
|
||||
|
@ -7007,12 +7024,49 @@ BACKSLASH
|
|||
For example, \p{Lu} always matches only upper case letters. This is
|
||||
different from the behaviour of current versions of Perl.
|
||||
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has
|
||||
to do a multistage table lookup in order to find a character's prop-
|
||||
erty. That is why the traditional escape sequences such as \d and \w do
|
||||
not use Unicode properties in PCRE2 by default, though you can make
|
||||
them do so by setting the PCRE2_UCP option or by starting the pattern
|
||||
with (*UCP).
|
||||
Binary (yes/no) properties for \p and \P
|
||||
|
||||
Unicode defines a number of binary properties, that is, properties
|
||||
whose only values are true or false. You can obtain a list of those
|
||||
that are recognized by \p and \P, along with their abbreviations, by
|
||||
running this command:
|
||||
|
||||
pcre2test -LP
|
||||
|
||||
|
||||
The Bidi_Class property for \p and \P
|
||||
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
|
||||
The recognized classes are:
|
||||
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
|
||||
Extended grapheme clusters
|
||||
|
||||
|
@ -7267,14 +7321,16 @@ FULL STOP (PERIOD, DOT) AND \N
|
|||
|
||||
Outside a character class, a dot in the pattern matches any one charac-
|
||||
ter in the subject string except (by default) a character that signi-
|
||||
fies the end of a line.
|
||||
fies the end of a line. One or more characters may be specified as line
|
||||
terminators (see "Newline conventions" above).
|
||||
|
||||
When a line ending is defined as a single character, dot never matches
|
||||
that character; when the two-character sequence CRLF is used, dot does
|
||||
not match CR if it is immediately followed by LF, but otherwise it
|
||||
matches all characters (including isolated CRs and LFs). When any Uni-
|
||||
code line endings are being recognized, dot does not match CR or LF or
|
||||
any of the other line ending characters.
|
||||
Dot never matches a single line-ending character. When the two-charac-
|
||||
ter sequence CRLF is the only line ending, dot does not match CR if it
|
||||
is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When ANYCRLF is selected for line
|
||||
endings, no occurences of CR of LF match dot. When all Unicode line
|
||||
endings are being recognized, dot does not match CR or LF or any of the
|
||||
other line ending characters.
|
||||
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
PCRE2_DOTALL option is set, a dot matches any one character, without
|
||||
|
@ -9640,8 +9696,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -9716,12 +9772,29 @@ STACK AND HEAP USAGE AT RUN TIME
|
|||
sive function calls could use a great deal of stack, and this could
|
||||
cause problems, but this usage has been eliminated. Backtracking posi-
|
||||
tions are now explicitly remembered in memory frames controlled by the
|
||||
code. An initial 20KiB vector of frames is allocated on the system
|
||||
stack (enough for about 100 frames for small patterns), but if this is
|
||||
insufficient, heap memory is used. The amount of heap memory can be
|
||||
limited; if the limit is set to zero, only the initial stack vector is
|
||||
used. Rewriting patterns to be time-efficient, as described below, may
|
||||
also reduce the memory requirements.
|
||||
code.
|
||||
|
||||
The size of each frame depends on the size of pointer variables and the
|
||||
number of capturing parenthesized groups in the pattern being matched.
|
||||
On a 64-bit system the frame size for a pattern with no captures is 128
|
||||
bytes. For each capturing group the size increases by 16 bytes.
|
||||
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on
|
||||
the system stack, but this still caused some issues for multi-thread
|
||||
applications where each thread has a very small stack. From release
|
||||
10.41 backtracking memory frames are always held in heap memory. An
|
||||
initial heap allocation is obtained the first time any match data block
|
||||
is passed to pcre2_match(). This is remembered with the match data
|
||||
block and re-used if that block is used for another match. It is freed
|
||||
when the match data block itself is freed.
|
||||
|
||||
The size of the initial block is the larger of 20KiB or ten times the
|
||||
pattern's frame size, unless the heap limit is less than this, in which
|
||||
case the heap limit is used. If the initial block proves to be too
|
||||
small during matching, it is replaced by a larger block, subject to the
|
||||
heap limit. The heap limit is checked only when a new block is to be
|
||||
allocated. Reducing the heap limit between calls to pcre2_match() with
|
||||
the same match data block does not affect the saved block.
|
||||
|
||||
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround as-
|
||||
|
@ -9869,14 +9942,14 @@ PROCESSING TIME
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -10312,11 +10385,11 @@ NAME
|
|||
SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS
|
||||
|
||||
int32_t pcre2_serialize_decode(pcre2_code **codes,
|
||||
int32_t number_of_codes, const uint32_t *bytes,
|
||||
int32_t number_of_codes, const uint8_t *bytes,
|
||||
pcre2_general_context *gcontext);
|
||||
|
||||
int32_t pcre2_serialize_encode(pcre2_code **codes,
|
||||
int32_t number_of_codes, uint32_t **serialized_bytes,
|
||||
int32_t pcre2_serialize_encode(const pcre2_code **codes,
|
||||
int32_t number_of_codes, uint8_t **serialized_bytes,
|
||||
PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
|
||||
|
||||
void pcre2_serialize_free(uint8_t *bytes);
|
||||
|
@ -10379,7 +10452,7 @@ SAVING COMPILED PATTERNS
|
|||
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
|
||||
|
@ -10440,7 +10513,6 @@ RE-USING PRECOMPILED PATTERNS
|
|||
If this argument is NULL, malloc() and free() are used. After deserial-
|
||||
ization, the byte stream is no longer needed and can be discarded.
|
||||
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
@ -10588,6 +10660,10 @@ CHARACTER TYPES
|
|||
iour of these escape sequences is changed to use Unicode properties and
|
||||
they match many more characters.
|
||||
|
||||
Property descriptions in \p and \P are matched caselessly; hyphens, un-
|
||||
derscores, and white space are ignored, in accordance with Unicode's
|
||||
"loose matching" rules.
|
||||
|
||||
|
||||
GENERAL CATEGORY PROPERTIES FOR \p and \P
|
||||
|
||||
|
@ -10604,6 +10680,7 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
|
@ -10650,33 +10727,56 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P
|
|||
acter set at release 5.18.
|
||||
|
||||
|
||||
SCRIPT NAMES FOR \p AND \P
|
||||
BINARY PROPERTIES FOR \p AND \P
|
||||
|
||||
Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali-
|
||||
nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi,
|
||||
Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba-
|
||||
nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform,
|
||||
Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do-
|
||||
gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor-
|
||||
gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur-
|
||||
mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana,
|
||||
Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
||||
tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
||||
Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin,
|
||||
Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani,
|
||||
Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede-
|
||||
faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero-
|
||||
glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi-
|
||||
nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham,
|
||||
Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic,
|
||||
Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur,
|
||||
Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa,
|
||||
Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra,
|
||||
Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng,
|
||||
Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le,
|
||||
Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai,
|
||||
Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Vai, Vithkuqi, Wancho,
|
||||
Warang_Citi, Yezidi, Yi, Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties
|
||||
whose only values are true or false. You can obtain a list of those
|
||||
that are recognized by \p and \P, along with their abbreviations, by
|
||||
running this command:
|
||||
|
||||
pcre2test -LP
|
||||
|
||||
|
||||
SCRIPT MATCHING WITH \p AND \P
|
||||
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P
|
||||
of course). You can obtain a list of these scripts by running this com-
|
||||
mand:
|
||||
|
||||
pcre2test -LS
|
||||
|
||||
|
||||
THE BIDI_CLASS PROPERTY FOR \p AND \P
|
||||
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
|
||||
The recognized classes are:
|
||||
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
|
||||
|
||||
CHARACTER CLASSES
|
||||
|
@ -11008,8 +11108,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -11051,15 +11151,17 @@ UNICODE PROPERTY SUPPORT
|
|||
|
||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||
\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF set-
|
||||
ting. The Unicode properties that can be tested are limited to the
|
||||
general category properties such as Lu for an upper case letter or Nd
|
||||
for a decimal number, the Unicode script names such as Arabic or Han,
|
||||
and the derived properties Any and L&. Full lists are given in the
|
||||
pcre2pattern and pcre2syntax documentation. Only the short names for
|
||||
properties are supported. For example, \p{L} matches a letter. Its Perl
|
||||
synonym, \p{Letter}, is not supported. Furthermore, in Perl, many
|
||||
properties may optionally be prefixed by "Is", for compatibility with
|
||||
Perl 5.6. PCRE2 does not support this.
|
||||
ting. The Unicode properties that can be tested are a subset of those
|
||||
that Perl supports. Currently they are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal num-
|
||||
ber, the Unicode script names such as Arabic or Han, Bidi_Class,
|
||||
Bidi_Control, and the derived properties Any and LC (synonym L&). Full
|
||||
lists are given in the pcre2pattern and pcre2syntax documentation. In
|
||||
general, only the short names for properties are supported. For exam-
|
||||
ple, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be pre-
|
||||
fixed by "Is", for compatibility with Perl 5.6. PCRE2 does not support
|
||||
this.
|
||||
|
||||
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -11437,14 +11539,14 @@ MATCHING IN INVALID UTF STRINGS
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 February 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 22 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -80,8 +80,17 @@ Additional options may be set in the compile context via the
|
|||
.\"
|
||||
function.
|
||||
.P
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the \fIerrorcode\fP argument to the the
|
||||
\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
|
||||
error was encountered is returned via the \fIerroroffset\fP argument.
|
||||
.P
|
||||
If there is no error, the value passed via \fIerrorcode\fP returns the message
|
||||
"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
|
||||
via \fIerroroffset\fP is zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
each option, in the
|
||||
|
|
|
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
\fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
|
||||
which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
.\" HREF
|
||||
\fBpcre2jit\fP
|
||||
.\"
|
||||
|
|
|
@ -36,7 +36,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL \fIcodes\fP or \fIbytes\fP is NULL
|
||||
.sp
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -18,9 +18,9 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{df800} to \ex{dfff}
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and
|
||||
|
|
|
@ -56,22 +56,32 @@ The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
|||
zero-terminated strings. The options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NOTBOL Subject is not the beginning of a
|
||||
line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY An empty string is not a
|
||||
valid match
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of
|
||||
the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in
|
||||
the subject or replacement
|
||||
.\" JOIN
|
||||
(only relevant if PCRE2_UTF was
|
||||
set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the
|
||||
subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for
|
||||
first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
|
@ -80,7 +90,7 @@ zero-terminated strings. The options are:
|
|||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
.P
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-zero; its
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
|
||||
contents must be the result of a call to \fBpcre2_match()\fP using the same
|
||||
pattern and subject.
|
||||
.P
|
||||
|
|
119
doc/pcre2api.3
119
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "30 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -953,7 +953,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
.P
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
pattern of the form
|
||||
|
@ -964,18 +964,18 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
|||
less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
|
||||
limit is set, less than the default.
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The \fBpcre2_match()\fP function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
\fBpcre2_match()\fP uses the heap are given in the
|
||||
.\" HREF
|
||||
\fBpcre2perform\fP
|
||||
.\"
|
||||
documentation.
|
||||
.P
|
||||
Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
|
||||
|
@ -1019,10 +1019,10 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
|
|||
.fi
|
||||
.sp
|
||||
This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1323,8 +1323,7 @@ If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and \fBpcre2_compile()\fP returns a non-NULL value.
|
||||
error has occurred.
|
||||
.P
|
||||
There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
|
||||
if it finds an error in the pattern. There are also some negative error codes
|
||||
|
@ -1343,14 +1342,17 @@ message"
|
|||
below)
|
||||
.\"
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in \fBpcre2.h\fP.
|
||||
for both positive and negative error codes in \fBpcre2.h\fP. When compilation
|
||||
is successful \fIerrorcode\fP is set to a value that returns the message "no
|
||||
error" if passed to \fBpcre2_get_error_message()\fP.
|
||||
.P
|
||||
The value returned in \fIerroroffset\fP is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
.P
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
cases, the offset passed back is the length of the pattern. Note that the
|
||||
|
@ -1794,7 +1796,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is
|
|||
undefined. It may cause your program to crash or loop.
|
||||
.P
|
||||
Note that this option can also be passed to \fBpcre2_match()\fP and
|
||||
\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
.P
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
|
@ -2015,8 +2017,8 @@ point. However, this applies only to characters whose code points are less than
|
|||
256. By default, higher-valued code points never match escapes such as \ew or
|
||||
\ed.
|
||||
.P
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode properties
|
||||
of all characters can be tested with \ep and \eP, or, alternatively, the
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \ep and \eP, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
|
@ -2624,7 +2626,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in
|
|||
\fIstartoffset\fP. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and
|
||||
\fIlength\fP is zero, the subject is assumed to be an empty string. If
|
||||
\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL.
|
||||
.P
|
||||
If \fIstartoffset\fP is greater than the length of the subject,
|
||||
\fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
|
||||
|
@ -3158,11 +3162,11 @@ The backtracking match limit was reached.
|
|||
.sp
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
.sp
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backgracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
.sp
|
||||
PCRE2_ERROR_NULL
|
||||
.sp
|
||||
|
@ -3413,12 +3417,16 @@ same number causes an error at compile time.
|
|||
.P
|
||||
This function optionally calls \fBpcre2_match()\fP and then makes a copy of the
|
||||
subject string in \fIoutputbuffer\fP, replacing parts that were matched with
|
||||
the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
||||
option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
||||
replacement string(s). The default action is to perform just one replacement if
|
||||
the pattern matches, but there is an option that requests multiple replacements
|
||||
(see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the
|
||||
replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an
|
||||
error occurs if \fIreplacement\fP is NULL.
|
||||
.P
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
.P
|
||||
If successful, \fBpcre2_substitute()\fP returns the number of substitutions
|
||||
that were carried out. This may be zero if no match was found, and is never
|
||||
|
@ -3447,12 +3455,12 @@ block may or may not have been changed.
|
|||
As well as the usual options for \fBpcre2_match()\fP, a number of additional
|
||||
options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
\fImatch_data\fP block must be provided, and it must have been used for an
|
||||
external call to \fBpcre2_match()\fP. The data in the \fImatch_data\fP block
|
||||
(return code, offset vector) is used for the first substitution instead of
|
||||
calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. This allows
|
||||
an application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
\fImatch_data\fP block must be provided, and it must have already been used for
|
||||
an external call to \fBpcre2_match()\fP with the same pattern and subject
|
||||
arguments. The data in the \fImatch_data\fP block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling \fBpcre2_match()\fP
|
||||
from within \fBpcre2_substitute()\fP. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
.P
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
|
||||
|
@ -3649,7 +3657,9 @@ needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
|||
default.
|
||||
.P
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
\fImatch_data\fP argument is NULL.
|
||||
\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0.
|
||||
.P
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||
|
@ -3811,12 +3821,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
.P
|
||||
The function \fBpcre2_dfa_match()\fP is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
\fBpcre2_dfa_match()\fP does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does
|
||||
not support, see the
|
||||
.\" HREF
|
||||
\fBpcre2matching\fP
|
||||
.\"
|
||||
|
@ -3848,7 +3859,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
|
|||
wspace, /* working space vector */
|
||||
20); /* number of elements (NOT size in bytes) */
|
||||
.
|
||||
.SS "Option bits for \fBpcre_dfa_match()\fP"
|
||||
.SS "Option bits for \fBpcre2_dfa_match()\fP"
|
||||
.rs
|
||||
.sp
|
||||
The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
|
||||
|
@ -4016,6 +4027,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "20 March 2020" "PCRE2 10.35"
|
||||
.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \eP, \ep,
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||
supported. Details are given in the
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
|
|||
\fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The \fBpcre2_match()\fP function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
|
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
.sp
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -624,7 +624,7 @@ give a warning.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -633,6 +633,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 March 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2COMPAT 3 "30 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
|
||||
|
@ -6,31 +6,38 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
.P
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
.P
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \eb* (but not \eb{3}, though oddly it does allow ^{3}), but these
|
||||
do not seem to have any use. PCRE2 does not allow any kind of quantifier on
|
||||
non-lookaround assertions.
|
||||
for example, \eb* , but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
.P
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
.P
|
||||
4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
\eU, and \eN when followed by a character name. \eN on its own, matching a
|
||||
non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -40,12 +47,12 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
|
|||
PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
|
||||
interprets them.
|
||||
.P
|
||||
5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \ep and \eP are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
|
||||
is limited. See the
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -53,14 +60,14 @@ documentation for details. The long synonyms for property names that Perl
|
|||
supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
.P
|
||||
6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \eQ and \eE which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \eQ
|
||||
and \eE which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \eQ and \eE just like any other character. Note the
|
||||
following examples:
|
||||
.sp
|
||||
Pattern PCRE2 matches Perl matches
|
||||
.sp
|
||||
|
@ -75,7 +82,7 @@ other character. Note the following examples:
|
|||
The \eQ...\eE sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
.P
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
.\" HREF
|
||||
|
@ -83,11 +90,11 @@ external function to be called during pattern matching. See the
|
|||
.\"
|
||||
documentation for details.
|
||||
.P
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
.P
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
|
@ -95,18 +102,18 @@ that is called as a subroutine, its action is limited to that group, even if
|
|||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
.P
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
.P
|
||||
11. There are some differences that are concerned with the settings of captured
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
"b".
|
||||
.P
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact the PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
|
@ -115,37 +122,38 @@ causes an error at compile time. If it were allowed, it would not be possible
|
|||
to distinguish which group matched, because both names map to capture group
|
||||
number 1. To avoid this confusing situation, an error is given at compile time.
|
||||
.P
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a group. If the /x modifier is
|
||||
set, Perl allowed white space between ( and ? though the latest Perls give an
|
||||
error (for a while it was just deprecated). There may still be some cases where
|
||||
Perl behaves differently.
|
||||
.P
|
||||
14. Perl, when in warning mode, gives warnings for character classes such as
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
[A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||
warning features, so it gives an error in these cases because they are almost
|
||||
certainly user mistakes.
|
||||
.P
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \ep{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.32), \ep{Lu} and \ep{Ll} match all
|
||||
in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
.P
|
||||
16. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
17. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\eK is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
.P
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.32:
|
||||
list is with respect to Perl 5.34:
|
||||
.sp
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same length.
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
.sp
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
in lookbehinds, provided that there is no possibility of referencing a
|
||||
|
@ -186,11 +194,11 @@ the pattern.
|
|||
extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
.P
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
.P
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
.\" HREF
|
||||
\fBpcre2limit\fP
|
||||
.\"
|
||||
|
@ -214,6 +222,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "31 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
|
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
|
@ -516,10 +525,7 @@ counter that is incremented each time around its main processing loop. If the
|
|||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
|
@ -732,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate files names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -960,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 31 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -42,13 +42,15 @@ DESCRIPTION
|
|||
|
||||
pcre2grep some-pattern file1 - file3
|
||||
|
||||
Input files are searched line by line. By default, each line that
|
||||
By default, input files are searched line by line. Each line that
|
||||
matches a pattern is copied to the standard output, and if there is
|
||||
more than one file, the file name is output at the start of each line,
|
||||
followed by a colon. However, there are options that can change how
|
||||
pcre2grep behaves. In particular, the -M option makes it possible to
|
||||
pcre2grep behaves. For example, the -M option makes it possible to
|
||||
search for strings that span line boundaries. What defines a line
|
||||
boundary is controlled by the -N (--newline) option.
|
||||
boundary is controlled by the -N (--newline) option. The -h and -H op-
|
||||
tions control whether or not file names are shown, and the -Z option
|
||||
changes the file name terminator to a zero byte.
|
||||
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the --buffer-size and
|
||||
|
@ -149,10 +151,12 @@ OPTIONS
|
|||
the file is reached, or if the processing buffer size has
|
||||
been set too small. If file names and/or line numbers are be-
|
||||
ing output, a hyphen separator is used instead of a colon for
|
||||
the context lines. A line containing "--" is output between
|
||||
each group of lines, unless they are in fact contiguous in
|
||||
the input file. The value of number is expected to be rela-
|
||||
tively small. When -c is used, -A is ignored.
|
||||
the context lines (the -Z option can be used to change the
|
||||
file name terminator to a zero byte). A line containing "--"
|
||||
is output between each group of lines, unless they are in
|
||||
fact contiguous in the input file. The value of number is ex-
|
||||
pected to be relatively small. When -c is used, -A is ig-
|
||||
nored.
|
||||
|
||||
-a, --text
|
||||
Treat binary files as text. This is equivalent to --binary-
|
||||
|
@ -170,11 +174,12 @@ OPTIONS
|
|||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
line numbers are being output, a hyphen separator is used in-
|
||||
stead of a colon for the context lines. A line containing
|
||||
"--" is output between each group of lines, unless they are
|
||||
in fact contiguous in the input file. The value of number is
|
||||
expected to be relatively small. When -c is used, -B is ig-
|
||||
nored.
|
||||
stead of a colon for the context lines (the -Z option can be
|
||||
used to change the file name terminator to a zero byte). A
|
||||
line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The
|
||||
value of number is expected to be relatively small. When -c
|
||||
is used, -B is ignored.
|
||||
|
||||
--binary-files=word
|
||||
Specify how binary files are to be processed. If the word is
|
||||
|
@ -387,22 +392,25 @@ OPTIONS
|
|||
|
||||
-H, --with-filename
|
||||
Force the inclusion of the file name at the start of output
|
||||
lines when searching a single file. By default, the file name
|
||||
is not shown in this case. For matching lines, the file name
|
||||
is followed by a colon; for context lines, a hyphen separator
|
||||
is used. If a line number is also being output, it follows
|
||||
the file name. When the -M option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file
|
||||
name. This option overrides any previous -h, -l, or -L op-
|
||||
tions.
|
||||
lines when searching a single file. The file name is not nor-
|
||||
mally shown in this case. By default, for matching lines,
|
||||
the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. The -Z option can be used to change
|
||||
the terminator to a zero byte. If a line number is also being
|
||||
output, it follows the file name. When the -M option causes a
|
||||
pattern to match more than one line, only the first is pre-
|
||||
ceded by the file name. This option overrides any previous
|
||||
-h, -l, or -L options.
|
||||
|
||||
-h, --no-filename
|
||||
Suppress the output file names when searching multiple files.
|
||||
By default, file names are shown when multiple files are
|
||||
searched. For matching lines, the file name is followed by a
|
||||
colon; for context lines, a hyphen separator is used. If a
|
||||
line number is also being output, it follows the file name.
|
||||
This option overrides any previous -H, -L, or -l options.
|
||||
File names are normally shown when multiple files are
|
||||
searched. By default, for matching lines, the file name is
|
||||
followed by a colon; for context lines, a hyphen separator is
|
||||
used. The -Z option can be used to change the terminator to a
|
||||
zero byte. If a line number is also being output, it follows
|
||||
the file name. This option overrides any previous -H, -L, or
|
||||
-l options.
|
||||
|
||||
--heap-limit=number
|
||||
See --match-limit below.
|
||||
|
@ -455,21 +463,23 @@ OPTIONS
|
|||
Instead of outputting lines from the files, just output the
|
||||
names of the files that do not contain any lines that would
|
||||
have been output. Each file name is output once, on a sepa-
|
||||
rate line. This option overrides any previous -H, -h, or -l
|
||||
options.
|
||||
rate line by default, but if the -Z option is set, they are
|
||||
separated by zero bytes instead of newlines. This option
|
||||
overrides any previous -H, -h, or -l options.
|
||||
|
||||
-l, --files-with-matches
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files containing lines that would have been out-
|
||||
put. Each file name is output once, on a separate line.
|
||||
Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the -c (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and
|
||||
those files that have at least one match are listed along
|
||||
with their counts. Using this option with -c is a way of sup-
|
||||
pressing the listing of files with no matches that occurs
|
||||
with -c on its own. This option overrides any previous -H,
|
||||
-h, or -L options.
|
||||
put. Each file name is output once, on a separate line, but
|
||||
if the -Z option is set, they are separated by zero bytes in-
|
||||
stead of newlines. Searching normally stops as soon as a
|
||||
matching line is found in a file. However, if the -c (count)
|
||||
option is also used, matching continues in order to obtain
|
||||
the correct count, and those files that have at least one
|
||||
match are listed along with their counts. Using this option
|
||||
with -c is a way of suppressing the listing of files with no
|
||||
matches that occurs with -c on its own. This option overrides
|
||||
any previous -H, -h, or -L options.
|
||||
|
||||
--label=name
|
||||
This option supplies a name to be used for the standard input
|
||||
|
@ -571,11 +581,8 @@ OPTIONS
|
|||
an error occurs.
|
||||
|
||||
The --heap-limit option specifies, as a number of kibibytes
|
||||
(units of 1024 bytes), the amount of heap memory that may be
|
||||
used for matching. Heap memory is needed only if matching the
|
||||
pattern requires a significant number of nested backtracking
|
||||
points to be remembered. This parameter can be set to zero to
|
||||
forbid the use of heap memory altogether.
|
||||
(units of 1024 bytes), the maximum amount of heap memory that
|
||||
may be used for matching.
|
||||
|
||||
The --depth-limit option limits the depth of nested back-
|
||||
tracking points, which indirectly limits the amount of memory
|
||||
|
@ -812,6 +819,13 @@ OPTIONS
|
|||
does not apply to patterns specified by any of the --include
|
||||
or --exclude options.
|
||||
|
||||
-Z, --null
|
||||
Terminate files names in the regular output with a zero byte
|
||||
(the NUL character) instead of what would normally appear.
|
||||
This is useful when file names contain unusual characters
|
||||
such as colons, hyphens, or even newlines. The option does
|
||||
not apply to file names in error messages.
|
||||
|
||||
|
||||
ENVIRONMENT VARIABLES
|
||||
|
||||
|
@ -1022,5 +1036,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 31 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -251,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
|
|||
starts another match, that match must use a different JIT stack to the one used
|
||||
for currently suspended match(es).
|
||||
.P
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
.P
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
to a match context that is used by any number of patterns, as long as they are
|
||||
|
@ -355,8 +355,8 @@ out this complicated API.
|
|||
.B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
|
||||
.fi
|
||||
.P
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -416,10 +416,10 @@ that was not compiled.
|
|||
.P
|
||||
When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
.P
|
||||
Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
|
||||
speedups of more than 10%.
|
||||
|
@ -445,6 +445,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 November 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SIZE AND OTHER LIMITATIONS"
|
||||
|
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
.P
|
||||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
.P
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -67,6 +71,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "3o0 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -509,7 +509,6 @@ for themselves. For example, outside a character class:
|
|||
.\" JOIN
|
||||
\e377 might be a backreference, otherwise
|
||||
the value 255 (decimal)
|
||||
.\" JOIN
|
||||
\e81 is always a backreference
|
||||
.sp
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
|
@ -773,200 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.P
|
||||
The extra escape sequences that provide property support are:
|
||||
.sp
|
||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The property names represented by \fIxx\fP above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
The property names represented by \fIxx\fP above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
.\" HTML <a href="#extraprops">
|
||||
.\" </a>
|
||||
next section).
|
||||
below).
|
||||
.\"
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \eP{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \eP{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "Script properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and a equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
.P
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
.sp
|
||||
\ep{Greek}
|
||||
\eP{Han}
|
||||
.sp
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
.P
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "The general category property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
specified by including a circumflex between the opening brace and the property
|
||||
|
@ -1026,9 +889,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
.sp
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
.P
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
the range U+D800 to U+DFFF. These characters are no different to any other
|
||||
|
@ -1052,12 +915,53 @@ Unicode table.
|
|||
Specifying caseless matching does not affect these escape sequences. For
|
||||
example, \ep{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.
|
||||
.
|
||||
.SS "Binary (yes/no) properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.SS "The Bidi_Class property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
.sp
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
.
|
||||
.
|
||||
.SS Extended grapheme clusters
|
||||
|
@ -1336,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
.sp
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
.\" HTML <a href="#newlines">
|
||||
.\" </a>
|
||||
"Newline conventions"
|
||||
.\"
|
||||
above).
|
||||
.P
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurences
|
||||
of CR of LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
.P
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
PCRE2_DOTALL option is set, a dot matches any one character, without exception.
|
||||
|
@ -2186,10 +2095,10 @@ be easier to remember:
|
|||
.sp
|
||||
(*atomic:\ed+)foo
|
||||
.sp
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
.P
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
string of characters that an identical standalone pattern would match, if
|
||||
|
@ -3905,6 +3814,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 PERFORMANCE"
|
||||
|
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
.P
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
.P
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to \fBpcre2_match()\fP. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
.P
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to \fBpcre2_match()\fP with the same match data block does not
|
||||
affect the saved block.
|
||||
.P
|
||||
In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround assertions,
|
||||
|
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -239,6 +255,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
.nf
|
||||
.B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
|
||||
.B " pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
|
||||
.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
|
||||
.B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
|
||||
|
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
.sp
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
.sp
|
||||
|
@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
|
|||
\fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
.sp
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "30 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range
|
|||
128-255. If the PCRE2_UCP option is set, the behaviour of these escape
|
||||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
.P
|
||||
Property descriptions in \ep and \eP are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
.
|
||||
.
|
||||
.SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP"
|
||||
|
@ -120,6 +124,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
.sp
|
||||
M Mark
|
||||
|
@ -167,170 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set
|
|||
at release 5.18.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT NAMES FOR \ep AND \eP"
|
||||
.SH "BINARY PROPERTIES FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cypro_Minoan,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Old_Uyghur,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangsa,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Toto,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Vithkuqi,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER CLASSES"
|
||||
|
@ -684,6 +578,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "30 August 2021" "PCRE 10.38"
|
||||
.TH PCRE2TEST 1 "27 July 2022" "PCRE 10.41"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -47,7 +47,7 @@ format before being passed to the library functions. Results are converted back
|
|||
to 8-bit code units for output.
|
||||
.P
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, \fBpcre_compile()\fP. The actual
|
||||
are given in generic form, for example, \fBpcre2_compile()\fP. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
.
|
||||
.
|
||||
|
@ -211,7 +211,17 @@ available, and the use of JIT for matching is verified.
|
|||
\fB-LM\fP
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LP\fP
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LS\fP
|
||||
List scripts: write a list of recogized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-pattern\fP \fImodifier-list\fP
|
||||
Behave as if each pattern line contains the given modifiers.
|
||||
|
@ -1196,7 +1206,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use \fBpcre2_dfa_match()\fP
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1206,6 +1217,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1516,7 +1529,7 @@ value that was set on the pattern.
|
|||
.sp
|
||||
The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
\fBfind_limits\fP modifier is specified.
|
||||
\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
|
||||
.
|
||||
.
|
||||
.SS "Finding minimum limits"
|
||||
|
@ -1526,8 +1539,12 @@ If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via \fBpcre2_set_heap_limit()\fP,
|
||||
\fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
.P
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
|
||||
|
@ -1551,9 +1568,7 @@ and non-recursive, to the internal matching function, thus controlling the
|
|||
overall amount of computing resource that is used.
|
||||
.P
|
||||
For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
.
|
||||
.
|
||||
.SS "Showing MARK names"
|
||||
|
@ -1572,12 +1587,10 @@ is added to the non-match message.
|
|||
.sp
|
||||
The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the \fBmemory\fP modifier never has any effect. For this modifier to work, the
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace that the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
\fBnull_context\fP modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
.
|
||||
|
@ -1629,7 +1642,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
.
|
||||
.
|
||||
.SS "Passing a NULL context"
|
||||
.SS "Passing a NULL context, subject, or replacement"
|
||||
.rs
|
||||
.sp
|
||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
||||
|
@ -1637,7 +1650,12 @@ Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
|||
If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
|
||||
\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
|
||||
modifiers.
|
||||
.P
|
||||
Similarly, for testing purposes, if the \fBnull_subject\fP or
|
||||
\fBnull_replacement\fP modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
.
|
||||
.
|
||||
.SH "THE ALTERNATIVE MATCHING FUNCTION"
|
||||
|
@ -2103,6 +2121,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -44,7 +44,7 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
|
|||
output.
|
||||
|
||||
In the rest of this document, the names of library functions and struc-
|
||||
tures are given in generic form, for example, pcre_compile(). The ac-
|
||||
tures are given in generic form, for example, pcre2_compile(). The ac-
|
||||
tual names used in the libraries have a suffix _8, _16, or _32, as ap-
|
||||
propriate.
|
||||
|
||||
|
@ -197,7 +197,17 @@ COMMAND LINE OPTIONS
|
|||
|
||||
-LM List modifiers: write a list of available pattern and subject
|
||||
modifiers to the standard output, then exit with zero exit
|
||||
code. All other options are ignored. If both -C and -LM are
|
||||
code. All other options are ignored. If both -C and any -Lx
|
||||
options are present, whichever is first is recognized.
|
||||
|
||||
-LP List properties: write a list of recognized Unicode proper-
|
||||
ties to the standard output, then exit with zero exit code.
|
||||
All other options are ignored. If both -C and any -Lx options
|
||||
are present, whichever is first is recognized.
|
||||
|
||||
-LS List scripts: write a list of recogized Unicode script names
|
||||
to the standard output, then exit with zero exit code. All
|
||||
other options are ignored. If both -C and any -Lx options are
|
||||
present, whichever is first is recognized.
|
||||
|
||||
-pattern modifier-list
|
||||
|
@ -1101,7 +1111,8 @@ SUBJECT MODIFIERS
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1111,6 +1122,8 @@ SUBJECT MODIFIERS
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1399,7 +1412,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||
priate limits in the match context. These values are ignored when the
|
||||
find_limits modifier is specified.
|
||||
find_limits or find_limits_noheap modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
|
@ -1407,8 +1420,12 @@ SUBJECT MODIFIERS
|
|||
calls the relevant matching function several times, setting different
|
||||
values in the match context via pcre2_set_heap_limit(),
|
||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||
minimum values for each parameter that allows the match to complete
|
||||
without error. If JIT is being used, only the match limit is relevant.
|
||||
smallest value for each parameter that allows the match to complete
|
||||
without a "limit exceeded" error. The match itself may succeed or fail.
|
||||
An alternative modifier, find_limits_noheap, omits the heap limit. This
|
||||
is used in the standard tests, because the minimum heap limit varies
|
||||
between systems. If JIT is being used, only the match limit is rele-
|
||||
vant, and the other two are automatically omitted.
|
||||
|
||||
When using this modifier, the pattern should not contain any limit set-
|
||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||
|
@ -1434,9 +1451,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
For both kinds of matching, the heap_limit number, which is in
|
||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||
for matching. A value of zero disables the use of any heap memory; many
|
||||
simple pattern matches can be done without using the heap, so zero is
|
||||
not an unreasonable setting.
|
||||
for matching.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
@ -1451,13 +1466,11 @@ SUBJECT MODIFIERS
|
|||
|
||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||
ory allocation and freeing calls that occur during a call to
|
||||
pcre2_match() or pcre2_dfa_match(). These occur only when a match re-
|
||||
quires a bigger vector than the default for remembering backtracking
|
||||
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()).
|
||||
In many cases there will be no heap memory used and therefore no addi-
|
||||
tional output. No heap memory is allocated during matching with JIT, so
|
||||
in that case the memory modifier never has any effect. For this modi-
|
||||
fier to work, the null_context modifier must not be set on both the
|
||||
pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
|
||||
used only when a match requires more internal workspace that the de-
|
||||
fault allocation on the stack, so in many cases there will be no out-
|
||||
put. No heap memory is allocated during matching with JIT. For this
|
||||
modifier to work, the null_context modifier must not be set on both the
|
||||
pattern and the subject, though it can be set on one or the other.
|
||||
|
||||
Setting a starting offset
|
||||
|
@ -1499,14 +1512,19 @@ SUBJECT MODIFIERS
|
|||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_match(),
|
||||
pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the
|
||||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly
|
||||
in this case (they use default values). This modifier cannot be used
|
||||
with the find_limits or substitute_callout modifiers.
|
||||
with the find_limits, find_limits_noheap, or substitute_callout modi-
|
||||
fiers.
|
||||
|
||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||
ment modifier is set, the subject or replacement string pointers are
|
||||
passed as NULL, respectively, to the relevant functions.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
@ -1933,5 +1951,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "23 February 2020" "PCRE2 10.35"
|
||||
.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -40,10 +40,11 @@ handled, as documented below.
|
|||
.sp
|
||||
When PCRE2 is built with Unicode support, the escape sequences \ep{..},
|
||||
\eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -51,10 +52,10 @@ and
|
|||
.\" HREF
|
||||
\fBpcre2syntax\fP
|
||||
.\"
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
.
|
||||
.
|
||||
.SH "WIDE CHARACTERS AND UTF MODES"
|
||||
|
@ -448,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -457,6 +458,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 February 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 22 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
8
index.md
8
index.md
|
@ -14,14 +14,14 @@ flexible API, the code of PCRE2 has been much improved since the fork.
|
|||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PhilipHazel/pcre2.git
|
||||
svn co https://github.com/PhilipHazel/pcre2.git
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
|
@ -36,7 +36,7 @@ default character encoding, can be found at
|
|||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
|
|
@ -0,0 +1,355 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This file is a Python module containing common lists and functions for the
|
||||
# GenerateXXX scripts that create various.c and .h files from Unicode data
|
||||
# files. It was created as part of a re-organizaton of these scripts in
|
||||
# December 2021.
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DATA LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
||||
|
||||
bidi_classes = [
|
||||
'AL', 'Arabic letter',
|
||||
'AN', 'Arabic number',
|
||||
'B', 'Paragraph separator',
|
||||
'BN', 'Boundary neutral',
|
||||
'CS', 'Common separator',
|
||||
'EN', 'European number',
|
||||
'ES', 'European separator',
|
||||
'ET', 'European terminator',
|
||||
'FSI', 'First strong isolate',
|
||||
'L', 'Left to right',
|
||||
'LRE', 'Left to right embedding',
|
||||
'LRI', 'Left to right isolate',
|
||||
'LRO', 'Left to right override',
|
||||
'NSM', 'Non-spacing mark',
|
||||
'ON', 'Other neutral',
|
||||
'PDF', 'Pop directional format',
|
||||
'PDI', 'Pop directional isolate',
|
||||
'R', 'Right to left',
|
||||
'RLE', 'Right to left embedding',
|
||||
'RLI', 'Right to left isolate',
|
||||
'RLO', 'Right to left override',
|
||||
'S', 'Segment separator',
|
||||
'WS', 'White space'
|
||||
]
|
||||
|
||||
# Particular category property names, with comments. NOTE: If ever this list
|
||||
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
|
||||
# must be edited to keep in step.
|
||||
|
||||
category_names = [
|
||||
'Cc', 'Control',
|
||||
'Cf', 'Format',
|
||||
'Cn', 'Unassigned',
|
||||
'Co', 'Private use',
|
||||
'Cs', 'Surrogate',
|
||||
'Ll', 'Lower case letter',
|
||||
'Lm', 'Modifier letter',
|
||||
'Lo', 'Other letter',
|
||||
'Lt', 'Title case letter',
|
||||
'Lu', 'Upper case letter',
|
||||
'Mc', 'Spacing mark',
|
||||
'Me', 'Enclosing mark',
|
||||
'Mn', 'Non-spacing mark',
|
||||
'Nd', 'Decimal number',
|
||||
'Nl', 'Letter number',
|
||||
'No', 'Other number',
|
||||
'Pc', 'Connector punctuation',
|
||||
'Pd', 'Dash punctuation',
|
||||
'Pe', 'Close punctuation',
|
||||
'Pf', 'Final punctuation',
|
||||
'Pi', 'Initial punctuation',
|
||||
'Po', 'Other punctuation',
|
||||
'Ps', 'Open punctuation',
|
||||
'Sc', 'Currency symbol',
|
||||
'Sk', 'Modifier symbol',
|
||||
'Sm', 'Mathematical symbol',
|
||||
'So', 'Other symbol',
|
||||
'Zl', 'Line separator',
|
||||
'Zp', 'Paragraph separator',
|
||||
'Zs', 'Space separator'
|
||||
]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_properties = [
|
||||
'CR', ' 0',
|
||||
'LF', ' 1',
|
||||
'Control', ' 2',
|
||||
'Extend', ' 3',
|
||||
'Prepend', ' 4',
|
||||
'SpacingMark', ' 5',
|
||||
'L', ' 6 Hangul syllable type L',
|
||||
'V', ' 7 Hangul syllable type V',
|
||||
'T', ' 8 Hangul syllable type T',
|
||||
'LV', ' 9 Hangul syllable type LV',
|
||||
'LVT', '10 Hangul syllable type LVT',
|
||||
'Regional_Indicator', '11',
|
||||
'Other', '12',
|
||||
'ZWJ', '13',
|
||||
'Extended_Pictographic', '14'
|
||||
]
|
||||
|
||||
# List of files from which the names of Boolean properties are obtained, along
|
||||
# with a list of regex patterns for properties to be ignored, and a list of
|
||||
# extra pattern names to add.
|
||||
|
||||
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
|
||||
bool_propsignore = [r'^Other_', r'^Hyphen$']
|
||||
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GET BOOLEAN PROPERTY NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Get a list of Boolean property names from a number of files.
|
||||
|
||||
def getbpropslist():
|
||||
bplist = []
|
||||
bplast = ""
|
||||
|
||||
for filename in bool_propsfiles:
|
||||
try:
|
||||
file = open('Unicode.tables/' + filename, 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1 or data[1] == bplast:
|
||||
continue
|
||||
bplast = data[1]
|
||||
for pat in bool_propsignore:
|
||||
if re.match(pat, bplast) != None:
|
||||
break
|
||||
else:
|
||||
bplist.append(bplast)
|
||||
|
||||
file.close()
|
||||
|
||||
bplist.extend(bool_propsextras)
|
||||
bplist.sort()
|
||||
return bplist
|
||||
|
||||
bool_properties = getbpropslist()
|
||||
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# COLLECTING PROPERTY NAMES AND ALIASES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_names = ['Unknown']
|
||||
abbreviations = {}
|
||||
|
||||
def collect_property_names():
|
||||
global script_names
|
||||
global abbreviations
|
||||
|
||||
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
|
||||
|
||||
last_script_name = ""
|
||||
with open("Unicode.tables/Scripts.txt") as f:
|
||||
for line in f:
|
||||
match_obj = names_re.match(line)
|
||||
|
||||
if match_obj == None or match_obj.group(1) == last_script_name:
|
||||
continue
|
||||
|
||||
last_script_name = match_obj.group(1)
|
||||
script_names.append(last_script_name)
|
||||
|
||||
# Sometimes there is comment in the line
|
||||
# so splitting around semicolon is not enough
|
||||
value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
|
||||
|
||||
with open("Unicode.tables/PropertyValueAliases.txt") as f:
|
||||
for line in f:
|
||||
match_obj = value_alias_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
if match_obj.group(1) == "sc":
|
||||
if match_obj.group(2) == match_obj.group(3):
|
||||
abbreviations[match_obj.group(3)] = ()
|
||||
elif match_obj.group(4) == None:
|
||||
abbreviations[match_obj.group(3)] = (match_obj.group(2),)
|
||||
else:
|
||||
abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
|
||||
|
||||
# We can also collect Boolean property abbreviations into the same dictionary
|
||||
|
||||
bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
|
||||
with open("Unicode.tables/PropertyAliases.txt") as f:
|
||||
for line in f:
|
||||
match_obj = bin_alias_re.match(line)
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
if match_obj.group(2) in bool_properties:
|
||||
if match_obj.group(3) == None:
|
||||
abbreviations[match_obj.group(2)] = (match_obj.group(1),)
|
||||
else:
|
||||
abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))
|
||||
|
||||
collect_property_names()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# REORDERING SCRIPT NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_abbrevs = []
|
||||
|
||||
def reorder_scripts():
|
||||
global script_names
|
||||
global script_abbrevs
|
||||
global abbreviations
|
||||
|
||||
for name in script_names:
|
||||
abbrevs = abbreviations[name]
|
||||
script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
|
||||
|
||||
extended_script_abbrevs = set()
|
||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
|
||||
|
||||
for line in f:
|
||||
match_obj = names_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
for name in match_obj.group(1).split(" "):
|
||||
extended_script_abbrevs.add(name)
|
||||
|
||||
new_script_names = []
|
||||
new_script_abbrevs = []
|
||||
|
||||
for idx, abbrev in enumerate(script_abbrevs):
|
||||
if abbrev in extended_script_abbrevs:
|
||||
new_script_names.append(script_names[idx])
|
||||
new_script_abbrevs.append(abbrev)
|
||||
|
||||
for idx, abbrev in enumerate(script_abbrevs):
|
||||
if abbrev not in extended_script_abbrevs:
|
||||
new_script_names.append(script_names[idx])
|
||||
new_script_abbrevs.append(abbrev)
|
||||
|
||||
script_names = new_script_names
|
||||
script_abbrevs = new_script_abbrevs
|
||||
|
||||
reorder_scripts()
|
||||
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DERIVED LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Create general character property names from the first letters of the
|
||||
# particular categories.
|
||||
|
||||
gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
|
||||
general_category_names = list(gcn_set)
|
||||
general_category_names.sort()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
|
||||
# Open an output file, using the command's argument or a default. Write common
|
||||
# preliminary header information.
|
||||
|
||||
def open_output(default):
|
||||
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a file name')
|
||||
sys.exit(1)
|
||||
if len(sys.argv) == 2:
|
||||
output_name = sys.argv[1]
|
||||
else:
|
||||
output_name = default
|
||||
try:
|
||||
file = open(output_name, "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open %s" % output_name)
|
||||
sys.exit(1)
|
||||
|
||||
script_name = sys.argv[0]
|
||||
i = script_name.rfind('/')
|
||||
if i >= 0:
|
||||
script_name = script_name[i+1:]
|
||||
|
||||
file.write("""\
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
|
||||
""")
|
||||
|
||||
file.write("Instead, modify the maint/%s script and run it to generate\n"
|
||||
"a new version of this code.\n\n" % script_name)
|
||||
|
||||
file.write("""\
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
\n""")
|
||||
return file
|
||||
|
||||
# End of UcpCommon.py
|
|
@ -0,0 +1,188 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This file auto-generates unicode property tests and their expected output.
|
||||
# It is recommended to re-run this generator after the unicode files are
|
||||
# updated. The names of the generated files are `testinput26` and `testoutput26`
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from GenerateCommon import \
|
||||
script_names, \
|
||||
script_abbrevs
|
||||
|
||||
def write_both(text):
|
||||
input_file.write(text)
|
||||
output_file.write(text)
|
||||
|
||||
def to_string_char(ch_idx):
|
||||
if ch_idx < 128:
|
||||
if ch_idx < 16:
|
||||
return "\\x{0%x}" % ch_idx
|
||||
if ch_idx >= 32:
|
||||
return chr(ch_idx)
|
||||
return "\\x{%x}" % ch_idx
|
||||
|
||||
output_directory = ""
|
||||
|
||||
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a directory name')
|
||||
sys.exit(1)
|
||||
if len(sys.argv) == 2:
|
||||
output_directory = sys.argv[1]
|
||||
if not output_directory.endswith("/"):
|
||||
output_directory += "/"
|
||||
|
||||
try:
|
||||
input_file = open(output_directory + "testinput26", "w")
|
||||
output_file = open(output_directory + "testoutput26", "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open output files")
|
||||
sys.exit(1)
|
||||
|
||||
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UNICODE SCRIPT EXTENSION TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
write_both("# Unicode Script Extension tests.\n\n")
|
||||
|
||||
def gen_script_tests():
|
||||
script_data = [None] * len(script_names)
|
||||
char_data = [None] * 0x110000
|
||||
|
||||
property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
|
||||
prev_name = ""
|
||||
script_idx = -1
|
||||
|
||||
with open("Unicode.tables/Scripts.txt") as f:
|
||||
for line in f:
|
||||
match_obj = property_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
name = match_obj.group(3)
|
||||
if name != prev_name:
|
||||
script_idx = script_names.index(name)
|
||||
prev_name = name
|
||||
|
||||
low = int(match_obj.group(1), 16)
|
||||
high = low
|
||||
char_data[low] = name
|
||||
|
||||
if match_obj.group(2) != None:
|
||||
high = int(match_obj.group(2), 16)
|
||||
for idx in range(low + 1, high + 1):
|
||||
char_data[idx] = name
|
||||
|
||||
if script_data[script_idx] == None:
|
||||
script_data[script_idx] = [low, None, None, None, None]
|
||||
script_data[script_idx][1] = high
|
||||
|
||||
extended_script_indicies = {}
|
||||
|
||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||
for line in f:
|
||||
match_obj = property_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
low = int(match_obj.group(1), 16)
|
||||
high = low
|
||||
if match_obj.group(2) != None:
|
||||
high = int(match_obj.group(2), 16)
|
||||
|
||||
for abbrev in match_obj.group(3).split(" "):
|
||||
if abbrev not in extended_script_indicies:
|
||||
idx = script_abbrevs.index(abbrev)
|
||||
extended_script_indicies[abbrev] = idx
|
||||
rec = script_data[idx]
|
||||
rec[2] = low
|
||||
rec[3] = high
|
||||
else:
|
||||
idx = extended_script_indicies[abbrev]
|
||||
rec = script_data[idx]
|
||||
if rec[2] > low:
|
||||
rec[2] = low
|
||||
if rec[3] < high:
|
||||
rec[3] = high
|
||||
|
||||
if rec[4] == None:
|
||||
name = script_names[idx]
|
||||
for idx in range(low, high + 1):
|
||||
if char_data[idx] != name:
|
||||
rec[4] = idx
|
||||
break
|
||||
|
||||
long_property_name = False
|
||||
|
||||
for idx, rec in enumerate(script_data):
|
||||
script_name = script_names[idx]
|
||||
|
||||
if script_name == "Unknown":
|
||||
continue
|
||||
|
||||
script_abbrev = script_abbrevs[idx]
|
||||
|
||||
write_both("# Base script check\n")
|
||||
write_both("/^\\p{sc=%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[0]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[0]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
|
||||
write_both(" %s\n" % to_string_char(rec[1]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[1]))
|
||||
write_both("\n")
|
||||
|
||||
if rec[2] != None:
|
||||
property_name = "scx"
|
||||
if long_property_name:
|
||||
property_name = "Script_Extensions"
|
||||
|
||||
write_both("# Script extension check\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[2]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[2]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
|
||||
write_both(" %s\n" % to_string_char(rec[3]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[3]))
|
||||
write_both("\n")
|
||||
|
||||
long_property_name = not long_property_name
|
||||
|
||||
if rec[4] != None:
|
||||
write_both("# Script extension only character\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[4]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[4]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{sc=%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[4]))
|
||||
output_file.write("No match\n")
|
||||
write_both("\n")
|
||||
else:
|
||||
print("External character has not found for %s" % script_name)
|
||||
|
||||
high = rec[1]
|
||||
if rec[3] != None and rec[3] > rec[1]:
|
||||
high = rec[3]
|
||||
write_both("# Character not in script\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(high + 1))
|
||||
output_file.write("No match\n")
|
||||
write_both("\n")
|
||||
|
||||
|
||||
gen_script_tests()
|
||||
|
||||
write_both("# End of testinput26\n")
|
|
@ -0,0 +1,923 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This script generates the pcre2_ucd.c file from Unicode data files. This is
|
||||
# the compressed Unicode property data used by PCRE2. The script was created in
|
||||
# December 2021 as part of the Unicode data generation refactoring. It is
|
||||
# basically a re-working of the MultiStage2.py script that was submitted to the
|
||||
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
|
||||
# Unicode property support. A number of extensions have since been added. The
|
||||
# main difference in the 2021 upgrade (apart from comments and layout) is that
|
||||
# the data tables (e.g. list of script names) are now listed in or generated by
|
||||
# a separate Python module that is shared with the other Generate scripts.
|
||||
#
|
||||
# This script must be run in the "maint" directory. It requires the following
|
||||
# Unicode data tables: BidiMirrorring.txt, CaseFolding.txt,
|
||||
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
|
||||
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
|
||||
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
|
||||
# subdirectory of the Unicode database (UCD) on the Unicode web site;
|
||||
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
|
||||
# are in the top-level UCD directory.
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to the original script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to the original script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# Added code to add a bidi class field to records by scanning the
|
||||
# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
|
||||
# bytes, so now 11 out of 12 are in use.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# PCRE2-10.39: Updated for Unicode 14.0.0
|
||||
# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
|
||||
# and also PropList.txt for the Bidi_Control property
|
||||
# 19-December-2021: Reworked script extensions lists to be bit maps instead
|
||||
# of zero-terminated lists of script numbers.
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# Changes to the refactored script:
|
||||
#
|
||||
# 26-December-2021: Refactoring completed
|
||||
# 10-January-2022: Addition of general Boolean property support
|
||||
# 12-January-2022: Merge scriptx and bidiclass fields
|
||||
# 14-January-2022: Enlarge Boolean property offset to 12 bits
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contains no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), one for each
|
||||
# Unicode character. Each record contains the script number, script extension
|
||||
# value, character type, grapheme break type, offset to caseless matching set,
|
||||
# offset to the character's other case, the bidi class, and offset to bitmap of
|
||||
# Boolean properties.
|
||||
#
|
||||
# A real table covering all Unicode characters would be far too big. It can be
|
||||
# efficiently compressed by observing that many characters have the same
|
||||
# record, and many blocks of characters (taking 128 characters in a block) have
|
||||
# the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
# process.
|
||||
#
|
||||
# This script constructs seven tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# Scripts are partitioned into two groups. Scripts that appear in at least one
|
||||
# character's script extension list come first, followed by "Unknown" and then
|
||||
# all the rest. This sorting is done automatically in the GenerateCommon.py
|
||||
# script. A script's number is its index in the script_names list.
|
||||
#
|
||||
# The ucd_script_sets table contains bitmaps that represent lists of scripts
|
||||
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
||||
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
||||
# used in any character's extension list, that is, enough for every script
|
||||
# whose number is less than ucp_Unknown. A character's script extension value
|
||||
# in its ucd record is an offset into the ucd_script_sets vector. The first
|
||||
# bitmap has no bits set; characters that have no script extensions have zero
|
||||
# as their script extensions value so that they use this map.
|
||||
#
|
||||
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
|
||||
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
|
||||
# numbers, enough to allocate a bit for each supported Boolean property.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique character record
|
||||
# that is required. The ucd_stage1 table is indexed by a character's block
|
||||
# number, which is the character's code point divided by 128, since 128 is the
|
||||
# size of each block. The result of a lookup in ucd_stage1 a "virtual" block
|
||||
# number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 14.0.0 database. Future
|
||||
# updates may make change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 35
|
||||
# record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
|
||||
# 0 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 44 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 93
|
||||
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
||||
# record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
|
||||
# 20 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 82 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
||||
# record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
|
||||
# 84 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 26762 = 0x688A => Combined Bidi class + script extension values
|
||||
# 96 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||
# 138 => Script Extension list offset = 138
|
||||
#
|
||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||
# 18, and 47 set. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||
#
|
||||
# Philip Hazel, last updated 14 January 2022.
|
||||
##############################################################################
|
||||
|
||||
|
||||
# Import standard modules
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_propsfiles, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_abbrevs, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Some general parameters
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DEFINE FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
|
||||
# or DerivedGeneralCategory.txt
|
||||
|
||||
def make_get_names(enum):
|
||||
return lambda chardata: enum.index(chardata[1])
|
||||
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
|
||||
def get_other_case(chardata):
|
||||
if chardata[1] == 'C' or chardata[1] == 'S':
|
||||
return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
return 0
|
||||
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
|
||||
def get_script_extension(chardata):
|
||||
global last_script_extension
|
||||
|
||||
offset = len(script_lists) * script_list_item_size
|
||||
if last_script_extension == chardata[1]:
|
||||
return offset - script_list_item_size
|
||||
|
||||
last_script_extension = chardata[1]
|
||||
script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
|
||||
return offset
|
||||
|
||||
|
||||
# Read a whole table in memory, setting/checking the Unicode version
|
||||
|
||||
def read_table(file_name, get_value, default_value):
|
||||
global unicode_version
|
||||
|
||||
f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
|
||||
file_base = f.group(1)
|
||||
version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
|
||||
file = open(file_name, 'r', encoding='utf-8')
|
||||
f = re.match(version_pat, file.readline())
|
||||
version = f.group(1)
|
||||
if unicode_version == "":
|
||||
unicode_version = version
|
||||
elif unicode_version != version:
|
||||
print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
|
||||
|
||||
table = [default_value] * MAX_UNICODE
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
value = get_value(chardata)
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
for i in range(char, last + 1):
|
||||
# It is important not to overwrite a previously set value because in the
|
||||
# CaseFolding file there are lines to be ignored (returning the default
|
||||
# value of 0) which often come after a line which has already set data.
|
||||
if table[i] == default_value:
|
||||
table[i] = value
|
||||
file.close()
|
||||
return table
|
||||
|
||||
|
||||
# Get the smallest possible C language type for the values in a table
|
||||
|
||||
def get_type_size(table):
|
||||
type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
|
||||
("signed char", 1), ("int16_t", 2), ("int32_t", 4)]
|
||||
limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127),
|
||||
(-32768, 32767), (-2147483648, 2147483647)]
|
||||
minval = min(table)
|
||||
maxval = max(table)
|
||||
for num, (minlimit, maxlimit) in enumerate(limits):
|
||||
if minlimit <= minval and maxval <= maxlimit:
|
||||
return type_size[num]
|
||||
raise OverflowError("Too large to fit into C types")
|
||||
|
||||
|
||||
# Get the total size of a list of tables
|
||||
|
||||
def get_tables_size(*tables):
|
||||
total_size = 0
|
||||
for table in tables:
|
||||
type, size = get_type_size(table)
|
||||
total_size += size * len(table)
|
||||
return total_size
|
||||
|
||||
|
||||
# Compress a table into the two stages
|
||||
|
||||
def compress_table(table, block_size):
|
||||
blocks = {} # Dictionary for finding identical blocks
|
||||
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
|
||||
stage2 = [] # Stage 2 table contains the blocks with property values
|
||||
table = tuple(table)
|
||||
for i in range(0, len(table), block_size):
|
||||
block = table[i:i+block_size]
|
||||
start = blocks.get(block)
|
||||
if start is None:
|
||||
# Allocate a new block
|
||||
start = len(stage2) / block_size
|
||||
stage2 += block
|
||||
blocks[block] = start
|
||||
stage1.append(start)
|
||||
return stage1, stage2
|
||||
|
||||
|
||||
# Output a table
|
||||
|
||||
def write_table(table, table_name, block_size = None):
|
||||
type, size = get_type_size(table)
|
||||
ELEMS_PER_LINE = 16
|
||||
|
||||
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
||||
if block_size:
|
||||
s += ", block = %d" % block_size
|
||||
f.write(s + " */\n")
|
||||
table = tuple(table)
|
||||
if block_size is None:
|
||||
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
|
||||
mult = MAX_UNICODE / len(table)
|
||||
for i in range(0, len(table), ELEMS_PER_LINE):
|
||||
f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
|
||||
else:
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
el = ELEMS_PER_LINE
|
||||
else:
|
||||
el = block_size
|
||||
fmt = "%3d," * el + "\n"
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
fmt = fmt * int(block_size / ELEMS_PER_LINE)
|
||||
for i in range(0, len(table), block_size):
|
||||
f.write(("\n/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
|
||||
f.write("};\n\n")
|
||||
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
|
||||
def combine_tables(*tables):
|
||||
records = {}
|
||||
index = []
|
||||
for t in zip(*tables):
|
||||
i = records.get(t)
|
||||
if i is None:
|
||||
i = records[t] = len(records)
|
||||
index.append(i)
|
||||
return index, records
|
||||
|
||||
|
||||
# Create a record struct
|
||||
|
||||
def get_record_size_struct(records):
|
||||
size = 0
|
||||
structure = 'typedef struct {\n'
|
||||
for i in range(len(records[0])):
|
||||
record_slice = [record[i] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
# add padding: round up to the nearest power of slice_size
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
size += slice_size
|
||||
structure += '%s property_%d;\n' % (slice_type, i)
|
||||
|
||||
# round up to the first item of the next structure in array
|
||||
record_slice = [record[0] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
|
||||
structure += '} ucd_record;\n*/\n'
|
||||
return size, structure
|
||||
|
||||
|
||||
# Write records
|
||||
|
||||
def write_records(records, record_size):
|
||||
f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
|
||||
'/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
|
||||
records = list(zip(list(records.keys()), list(records.values())))
|
||||
records.sort(key = lambda x: x[1])
|
||||
for i, record in enumerate(records):
|
||||
f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
|
||||
f.write('};\n\n')
|
||||
|
||||
|
||||
# Write a bit set
|
||||
|
||||
def write_bitsets(list, item_size):
|
||||
for d in list:
|
||||
bitwords = [0] * item_size
|
||||
for idx in d:
|
||||
bitwords[idx // 32] |= 1 << (idx & 31)
|
||||
s = " "
|
||||
for x in bitwords:
|
||||
f.write("%s" % s)
|
||||
s = ", "
|
||||
f.write("0x%08xu" % x)
|
||||
f.write(",\n")
|
||||
f.write("};\n\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# This bit of code must have been useful when the original script was being
|
||||
# developed. Retain it just in case it is ever needed again.
|
||||
|
||||
# def test_record_size():
|
||||
# tests = [ \
|
||||
# ( [(3,), (6,), (6,), (1,)], 1 ), \
|
||||
# ( [(300,), (600,), (600,), (100,)], 2 ), \
|
||||
# ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
||||
# ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
||||
# ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
||||
# ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
||||
# ]
|
||||
# for test in tests:
|
||||
# size, struct = get_record_size_struct(test[0])
|
||||
# assert(size == test[1])
|
||||
# test_record_size()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR CREATING TABLES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
unicode_version = ""
|
||||
|
||||
# Some of the tables imported from GenerateCommon.py have alternate comment
|
||||
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
|
||||
# remove them.
|
||||
|
||||
bidi_classes = bidi_classes[::2]
|
||||
break_properties = break_properties[::2]
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create the various tables from Unicode data files
|
||||
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
|
||||
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
||||
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
|
||||
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
||||
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
if chardata[1] != "Extended_Pictographic":
|
||||
continue
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
for i in range(char, last + 1):
|
||||
if break_props[i] != break_properties.index('Other'):
|
||||
print("WARNING: Emoji 0x%x has break property %s, not 'Other'",
|
||||
i, break_properties[break_props[i]], file=sys.stderr)
|
||||
break_props[i] = break_properties.index('Extended_Pictographic')
|
||||
file.close()
|
||||
|
||||
# Handle script extensions. The get_script_extesion() function maintains a
|
||||
# list of unique bitmaps representing lists of scripts, returning the offset
|
||||
# in that list. Initialize the list with an empty set, which is used for
|
||||
# characters that have no script extensions.
|
||||
|
||||
script_lists = [[]]
|
||||
last_script_extension = ""
|
||||
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
|
||||
|
||||
for idx in range(len(scriptx_bidi_class)):
|
||||
scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
|
||||
bidi_class = None
|
||||
|
||||
# Find the Boolean properties of each character. This next bit of magic creates
|
||||
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
|
||||
# the *same* list, which is not what we want.
|
||||
|
||||
bprops = [[] for _ in range(MAX_UNICODE)]
|
||||
|
||||
# Collect the properties from the various files
|
||||
|
||||
for filename in bool_propsfiles:
|
||||
try:
|
||||
file = open('Unicode.tables/' + filename, 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1:
|
||||
continue
|
||||
|
||||
try:
|
||||
ix = bool_properties.index(data[1])
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
|
||||
for i in range(char, last + 1):
|
||||
bprops[i].append(ix)
|
||||
|
||||
file.close()
|
||||
|
||||
# The ASCII property isn't listed in any files, but it is easy enough to add
|
||||
# it manually.
|
||||
|
||||
ix = bool_properties.index("ASCII")
|
||||
for i in range(128):
|
||||
bprops[i].append(ix)
|
||||
|
||||
# The Bidi_Mirrored property isn't listed in any property files. We have to
|
||||
# deduce it from the file that lists the mirrored characters.
|
||||
|
||||
ix = bool_properties.index("Bidi_Mirrored")
|
||||
|
||||
try:
|
||||
file = open('Unicode.tables/BidiMirroring.txt', 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1:
|
||||
continue
|
||||
c = int(data[0], 16)
|
||||
bprops[c].append(ix)
|
||||
|
||||
file.close()
|
||||
|
||||
# Scan each character's boolean property list and created a list of unique
|
||||
# lists, at the same time, setting the index in that list for each property in
|
||||
# the bool_props vector.
|
||||
|
||||
bool_props = [0] * MAX_UNICODE
|
||||
bool_props_lists = [[]]
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
s = set(bprops[c])
|
||||
for i in range(len(bool_props_lists)):
|
||||
if s == set(bool_props_lists[i]):
|
||||
break;
|
||||
else:
|
||||
bool_props_lists.append(bprops[c])
|
||||
i += 1
|
||||
|
||||
bool_props[c] = i * bool_props_list_item_size
|
||||
|
||||
# This block of code was added by PH in September 2012. It scans the other_case
|
||||
# table to find sets of more than two characters that must all match each other
|
||||
# caselessly. Later in this script a table of these sets is written out.
|
||||
# However, we have to do this work here in order to compute the offsets in the
|
||||
# table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
caseless_sets = []
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
# now have three characters that are case-equivalent.
|
||||
|
||||
if other_case[o] != -other_case[c]:
|
||||
t = o + other_case[o]
|
||||
|
||||
# Scan the existing sets to see if any of the three characters are already
|
||||
# part of a set. If so, unite the existing set with the new set.
|
||||
|
||||
appended = 0
|
||||
for s in caseless_sets:
|
||||
found = 0
|
||||
for x in s:
|
||||
if x == c or x == o or x == t:
|
||||
found = 1
|
||||
|
||||
# Add new characters to an existing set
|
||||
|
||||
if found:
|
||||
found = 0
|
||||
for y in [c, o, t]:
|
||||
for x in s:
|
||||
if x == y:
|
||||
found = 1
|
||||
if not found:
|
||||
s.append(y)
|
||||
appended = 1
|
||||
|
||||
# If we have not added to an existing set, create a new one.
|
||||
|
||||
if not appended:
|
||||
caseless_sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
caseless_offsets = [0] * MAX_UNICODE
|
||||
|
||||
offset = 1;
|
||||
for s in caseless_sets:
|
||||
for x in s:
|
||||
caseless_offsets[x] = offset
|
||||
offset += len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine all the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case, scriptx_bidi_class, bool_props)
|
||||
|
||||
# Find the record size and create a string definition of the structure for
|
||||
# outputting as a comment.
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
size += get_tables_size(stage1, stage2)
|
||||
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR WRITING THE OUTPUT FILE
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucd.c")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
/* This file contains tables of Unicode properties that are extracted from
|
||||
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
|
||||
details.
|
||||
|
||||
As well as being part of the PCRE2 library, this file is #included by the
|
||||
pcre2test program, which redefines the PRIV macro to change table names from
|
||||
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
|
||||
just one of these tables is actually needed. When compiling the library, some
|
||||
headers are needed. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* The tables herein are needed only when UCP support is built, and in PCRE2
|
||||
that happens automatically with UTF support. This module should not be
|
||||
referenced otherwise, so it should not matter whether it is compiled or not.
|
||||
However a comment was received about space saving - maybe the guy linked all
|
||||
the modules rather than using a library - so we include a condition to cut out
|
||||
the tables when not needed. But don't leave a totally empty module because some
|
||||
compilers barf at that. Instead, just supply some small dummy tables. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
|
||||
const uint16_t PRIV(ucd_stage1)[] = {0};
|
||||
const uint16_t PRIV(ucd_stage2)[] = {0};
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
|
||||
#else
|
||||
\n""")
|
||||
|
||||
# --- Output some variable heading stuff ---
|
||||
|
||||
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
|
||||
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
|
||||
|
||||
f.write("""\
|
||||
/* When recompiling tables with a new Unicode version, please check the types
|
||||
in this structure definition with those in pcre2_internal.h (the actual field
|
||||
names will be different).
|
||||
\n""")
|
||||
|
||||
f.write(record_struct)
|
||||
|
||||
f.write("""
|
||||
/* If the 32-bit library is run in non-32-bit mode, character values greater
|
||||
than 0x10ffff may be encountered. For these we set up a special record. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
const ucd_record PRIV(dummy_ucd_record)[] = {{
|
||||
ucp_Unknown, /* script */
|
||||
ucp_Cn, /* type unassigned */
|
||||
ucp_gbOther, /* grapheme break property */
|
||||
0, /* case set */
|
||||
0, /* other case */
|
||||
0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
|
||||
0, /* bool properties offset */
|
||||
}};
|
||||
#endif
|
||||
\n""")
|
||||
|
||||
# --- Output the table of caseless character sets ---
|
||||
|
||||
f.write("""\
|
||||
/* This table contains lists of characters that are caseless sets of
|
||||
more than one character. Each list is terminated by NOTACHAR. */
|
||||
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {
|
||||
NOTACHAR,
|
||||
""")
|
||||
|
||||
for s in caseless_sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
f.write(' 0x%04x,' % x)
|
||||
f.write(' NOTACHAR,\n')
|
||||
f.write('};\n\n')
|
||||
|
||||
# --- Other tables are not needed by pcre2test ---
|
||||
|
||||
f.write("""\
|
||||
/* When #included in pcre2test, we don't need the table of digit sets, nor the
|
||||
the large main UCD tables. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
\n""")
|
||||
|
||||
# --- Read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
digitsets = []
|
||||
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
|
||||
|
||||
for line in file:
|
||||
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
|
||||
if m is None:
|
||||
continue
|
||||
first = int(m.group(1),16)
|
||||
last = int(m.group(2),16)
|
||||
if ((last - first + 1) % 10) != 0:
|
||||
f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
|
||||
file=sys.stderr)
|
||||
while first < last:
|
||||
digitsets.append(first + 9)
|
||||
first += 10
|
||||
file.close()
|
||||
digitsets.sort()
|
||||
|
||||
f.write("""\
|
||||
/* This table lists the code points for the '9' characters in each set of
|
||||
decimal digits. It is used to ensure that all the digits in a script run come
|
||||
from the same set. */
|
||||
|
||||
const uint32_t PRIV(ucd_digit_sets)[] = {
|
||||
""")
|
||||
|
||||
f.write(" %d, /* Number of subsequent values */" % len(digitsets))
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
f.write("\n ")
|
||||
count = 0
|
||||
f.write(" 0x%05x," % d)
|
||||
count += 1
|
||||
f.write("\n};\n\n")
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of script bitsets for the Script Extension property.
|
||||
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
|
||||
ucd_script_sets_item_size. */
|
||||
|
||||
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||
""")
|
||||
write_bitsets(script_lists, script_list_item_size)
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of bitsets for Boolean properties. The number of
|
||||
32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
|
||||
pcre2_ucp.h. */
|
||||
|
||||
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||
""")
|
||||
write_bitsets(bool_props_lists, bool_props_list_item_size)
|
||||
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
f.write("""\
|
||||
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
|
||||
into a 16-bit field, and offset in binary properties table (16 bits). */
|
||||
\n""")
|
||||
|
||||
write_records(records, record_size)
|
||||
write_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
|
||||
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
|
||||
f.write("""\
|
||||
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
|
||||
#endif
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* End of pcre2_ucd.c */
|
||||
""")
|
||||
|
||||
f.close
|
||||
|
||||
# End
|
|
@ -0,0 +1,98 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucp.h file from Unicode data files. This
|
||||
# header uses enumerations to give names to Unicode property types and script
|
||||
# names.
|
||||
|
||||
# This script was created in December 2021 as part of the Unicode data
|
||||
# generation refactoring.
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucp.h")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
|
||||
/* This file contains definitions of the Unicode property values that are
|
||||
returned by the UCD access macros and used throughout PCRE2.
|
||||
|
||||
IMPORTANT: The specific values of the first two enums (general and particular
|
||||
character categories) are assumed by the table called catposstab in the file
|
||||
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
|
||||
an update. */
|
||||
\n""")
|
||||
|
||||
f.write("/* These are the general character categories. */\n\nenum {\n")
|
||||
for i in general_category_names:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the particular character categories. */\n\nenum {\n")
|
||||
for i in range(0, len(category_names), 2):
|
||||
f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are Boolean properties. */\n\nenum {\n")
|
||||
for i in bool_properties:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Bprop_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
|
||||
f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
|
||||
|
||||
f.write("/* These are the bidi class values. */\n\nenum {\n")
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
sp = ' ' * (4 - len(bidi_classes[i]))
|
||||
f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are grapheme break properties. The Extended Pictographic "
|
||||
"property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
|
||||
for i in range(0, len(break_properties), 2):
|
||||
sp = ' ' * (21 - len(break_properties[i]))
|
||||
f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n")
|
||||
for i in script_names:
|
||||
if i == "Unknown":
|
||||
f.write("\n /* Scripts which has no characters in other scripts. */\n")
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("\n")
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Script_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_script_sets[] */\n\n")
|
||||
f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
|
||||
|
||||
f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
|
||||
f.write("/* End of pcre2_ucp.h */\n")
|
||||
|
||||
f.close()
|
||||
|
||||
# End
|
|
@ -0,0 +1,203 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucptables.c file, which contains tables for
|
||||
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
|
||||
# order to reduce the number of relocations when loading the PCRE2 library, the
|
||||
# names are held as a single large string, with offsets in the table. This is
|
||||
# tedious to maintain by hand. Therefore, a script is used to generate the
|
||||
# table.
|
||||
|
||||
# This script was created in December 2021 based on the previous GenerateUtt
|
||||
# script, whose output had to be manually edited into pcre2_tables.c. Here is
|
||||
# the history of the original script:
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modfied by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
||||
# Added support for bidi class and bidi control, 06-December-2021
|
||||
# This also involved lower casing strings and removing underscores, in
|
||||
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
||||
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
||||
# -----------------------------------------------------------------------------
|
||||
#
|
||||
# Note subsequent changes here:
|
||||
#
|
||||
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||
# 10-January-2022: Further updates for Boolean property support
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
abbreviations, \
|
||||
bool_properties, \
|
||||
bidi_classes, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucptables.c")
|
||||
|
||||
# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
|
||||
# etc., along with comments. We need to add "bidi" in front of each value, in
|
||||
# order to create names that don't clash with other types of property.
|
||||
|
||||
bidi_class_names = []
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
bidi_class_names.append("bidi" + bidi_classes[i])
|
||||
|
||||
# Remove the comments from other lists that contain them.
|
||||
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create standardized versions of the names by lowercasing and removing
|
||||
# underscores.
|
||||
|
||||
def stdname(x):
|
||||
return x.lower().replace('_', '')
|
||||
|
||||
def stdnames(x):
|
||||
y = [''] * len(x)
|
||||
for i in range(len(x)):
|
||||
y[i] = stdname(x[i])
|
||||
return y
|
||||
|
||||
std_category_names = stdnames(category_names)
|
||||
std_general_category_names = stdnames(general_category_names)
|
||||
std_bidi_class_names = stdnames(bidi_class_names)
|
||||
std_bool_properties = stdnames(bool_properties)
|
||||
|
||||
# Create the table, starting with the Unicode script, category and bidi class
|
||||
# names. We keep both the standardized name and the original, because the
|
||||
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||
# still use the full original names.
|
||||
|
||||
utt_table = []
|
||||
|
||||
scx_end = script_names.index('Unknown')
|
||||
|
||||
for idx, name in enumerate(script_names):
|
||||
pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
|
||||
utt_table.append((stdname(name), name, pt_type))
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, pt_type))
|
||||
|
||||
# Add the remaining property lists
|
||||
|
||||
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||
|
||||
for name in bool_properties:
|
||||
utt_table.append((stdname(name), name, 'PT_BOOL'))
|
||||
if name in abbreviations:
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
|
||||
|
||||
# Now add specials and synonyms. Note both the standardized and capitalized
|
||||
# forms are needed.
|
||||
|
||||
utt_table.append(('any', 'Any', 'PT_ANY'))
|
||||
utt_table.append(('l&', 'L&', 'PT_LAMP'))
|
||||
utt_table.append(('lc', 'LC', 'PT_LAMP'))
|
||||
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
|
||||
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
|
||||
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
|
||||
utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
|
||||
utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
|
||||
|
||||
# Remove duplicates from the table and then sort it.
|
||||
|
||||
utt_table = list(set(utt_table))
|
||||
utt_table.sort()
|
||||
|
||||
# Output file-specific heading
|
||||
|
||||
f.write("""\
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
||||
/* The PRIV(utt)[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
field of each entry. However, that leads to a large number of relocations when
|
||||
a shared library is dynamically loaded. A significant reduction is made by
|
||||
putting all the names into a single, large string and using offsets instead.
|
||||
All letters are lower cased, and underscores are removed, in accordance with
|
||||
the "loose matching" rules that Unicode advises and Perl uses. */
|
||||
\n""")
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
for utt in utt_table:
|
||||
f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))
|
||||
for c in utt[0]:
|
||||
if c == '&':
|
||||
f.write(' STR_AMPERSAND')
|
||||
else:
|
||||
f.write(' STR_%s' % c);
|
||||
f.write(' "\\0"\n')
|
||||
|
||||
# Output the long string of concatenated names
|
||||
|
||||
f.write('\nconst char PRIV(utt_names)[] =\n');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
|
||||
# Output the property type table
|
||||
|
||||
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
|
||||
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
|
||||
value = '0'
|
||||
else:
|
||||
value = 'ucp_' + utt[1]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
f.write('};\n\n')
|
||||
|
||||
# Ending text
|
||||
|
||||
f.write("""\
|
||||
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_ucptables.c */
|
||||
""")
|
||||
|
||||
f.close
|
||||
|
||||
# End
|
|
@ -1,140 +0,0 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Generate utt tables. Note: this script has now been converted to Python 3.
|
||||
|
||||
# The source file pcre2_tables.c contains (amongst other things), a table that
|
||||
# is indexed by script name. In order to reduce the number of relocations when
|
||||
# loading the library, the names are held as a single large string, with
|
||||
# offsets in the table. This is tedious to maintain by hand. Therefore, this
|
||||
# script is used to generate the table. The output is sent to stdout; usually
|
||||
# that should be directed to a temporary file. Then pcre2_tables.c can be
|
||||
# edited by replacing the relevant definitions and table therein with the
|
||||
# temporary file.
|
||||
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modfied by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic', \
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
||||
# New for Unicode 8.0.0
|
||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
||||
'SignWriting',
|
||||
# New for Unicode 10.0.0
|
||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
||||
# New for Unicode 11.0.0
|
||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
||||
'Old_Sogdian', 'Sogdian',
|
||||
# New for Unicode 12.0.0
|
||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
||||
# New for Unicode 13.0.0
|
||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
|
||||
# New for Unicode 14.0.0
|
||||
'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
|
||||
]
|
||||
|
||||
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
||||
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
||||
|
||||
general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
|
||||
|
||||
# First add the Unicode script and category names.
|
||||
|
||||
utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
|
||||
utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
|
||||
# Now add our own specials.
|
||||
|
||||
utt_table.append(('Any', 'PT_ANY'))
|
||||
utt_table.append(('L&', 'PT_LAMP'))
|
||||
utt_table.append(('Xan', 'PT_ALNUM'))
|
||||
utt_table.append(('Xps', 'PT_PXSPACE'))
|
||||
utt_table.append(('Xsp', 'PT_SPACE'))
|
||||
utt_table.append(('Xuc', 'PT_UCNC'))
|
||||
utt_table.append(('Xwd', 'PT_WORD'))
|
||||
|
||||
# Sort the table.
|
||||
|
||||
utt_table.sort()
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
for utt in utt_table:
|
||||
print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
|
||||
for c in utt[0]:
|
||||
if c == '_':
|
||||
print('STR_UNDERSCORE', end=' ')
|
||||
elif c == '&':
|
||||
print('STR_AMPERSAND', end=' ')
|
||||
else:
|
||||
print('STR_%s' % c, end=' ');
|
||||
print('"\\0"')
|
||||
|
||||
# Print the actual table, using the string names
|
||||
|
||||
print('')
|
||||
print('const char PRIV(utt_names)[] =');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
# This was how it was done before the EBCDIC-compatible modification.
|
||||
# print ' "%s\\0"%s' % (utt[0], last)
|
||||
|
||||
print('\nconst ucp_type_table PRIV(utt)[] = {')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
|
||||
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
|
||||
value = '0'
|
||||
else:
|
||||
value = 'ucp_' + utt[0]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
print('};')
|
|
@ -1,819 +0,0 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Multistage table builder
|
||||
# (c) Peter Kankowski, 2008
|
||||
|
||||
##############################################################################
|
||||
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
||||
# the upgrading of Unicode property support. The new code speeds up property
|
||||
# matching many times. The script is for the use of PCRE maintainers, to
|
||||
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
|
||||
# data tables. A number of extensions have been added to the original script.
|
||||
#
|
||||
# The script has now been upgraded to Python 3 for PCRE2, and should be run in
|
||||
# the maint subdirectory, using the command
|
||||
#
|
||||
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||
#
|
||||
# It requires six Unicode data tables: DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt,
|
||||
# CaseFolding.txt, and emoji-data.txt. These must be in the
|
||||
# maint/Unicode.tables subdirectory.
|
||||
#
|
||||
# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the
|
||||
# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is
|
||||
# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and
|
||||
# CaseFolding.txt are directly in the UCD directory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to this script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to this script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# PCRE2-10.39: Updated for Unicode 14.0.0
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contains no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), containing a
|
||||
# script number, script extension value, character type, grapheme break type,
|
||||
# offset to caseless matching set, offset to the character's other case, for
|
||||
# every Unicode character. However, a real table covering all Unicode
|
||||
# characters would be far too big. It can be efficiently compressed by
|
||||
# observing that many characters have the same record, and many blocks of
|
||||
# characters (taking 128 characters in a block) have the same set of records as
|
||||
# other blocks. This leads to a 2-stage lookup process.
|
||||
#
|
||||
# This script constructs six tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# The ucd_script_sets vector contains lists of script numbers that are the
|
||||
# Script Extensions properties of certain characters. Each list is terminated
|
||||
# by zero (ucp_Unknown). A character with more than one script listed for its
|
||||
# Script Extension property has a negative value in its record. This is the
|
||||
# negated offset to the start of the relevant list in the ucd_script_sets
|
||||
# vector.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique record that is
|
||||
# required. The ucd_stage1 table is indexed by a character's block number,
|
||||
# which is the character's code point divided by 128, since 128 is the size
|
||||
# of each block. The result of a lookup in ucd_stage1 a "virtual" block number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 11.0.0 database. Future
|
||||
# updates may make change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 17
|
||||
# record 17 is { 34, 5, 12, 0, -32, 34, 0 }
|
||||
# 34 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 34 = ucp_Latin => No special Script Extension property
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 90
|
||||
# lookup 66 (0x42) in table 90 in stage2 yields 564
|
||||
# record 564 is { 27, 7, 12, 0, 0, 27, 0 }
|
||||
# 27 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 27 = ucp_Hiragana => No special Script Extension property
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 458
|
||||
# record 458 is { 28, 12, 3, 0, 0, -101, 0 }
|
||||
# 28 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# -101 => Script Extension list offset = 101
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29,
|
||||
# and terminator 0. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
|
||||
#
|
||||
# Philip Hazel, 03 July 2008
|
||||
##############################################################################
|
||||
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
|
||||
def make_get_names(enum):
|
||||
return lambda chardata: enum.index(chardata[1])
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
def get_other_case(chardata):
|
||||
if chardata[1] == 'C' or chardata[1] == 'S':
|
||||
return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
return 0
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
def get_script_extension(chardata):
|
||||
this_script_list = list(chardata[1].split(' '))
|
||||
if len(this_script_list) == 1:
|
||||
return script_abbrevs.index(this_script_list[0])
|
||||
|
||||
script_numbers = []
|
||||
for d in this_script_list:
|
||||
script_numbers.append(script_abbrevs.index(d))
|
||||
script_numbers.append(0)
|
||||
script_numbers_length = len(script_numbers)
|
||||
|
||||
for i in range(1, len(script_lists) - script_numbers_length + 1):
|
||||
for j in range(0, script_numbers_length):
|
||||
found = True
|
||||
if script_lists[i+j] != script_numbers[j]:
|
||||
found = False
|
||||
break
|
||||
if found:
|
||||
return -i
|
||||
|
||||
# Not found in existing lists
|
||||
|
||||
return_value = len(script_lists)
|
||||
script_lists.extend(script_numbers)
|
||||
return -return_value
|
||||
|
||||
# Read the whole table in memory, setting/checking the Unicode version
|
||||
def read_table(file_name, get_value, default_value):
|
||||
global unicode_version
|
||||
|
||||
f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
|
||||
file_base = f.group(1)
|
||||
version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
|
||||
file = open(file_name, 'r', encoding='utf-8')
|
||||
f = re.match(version_pat, file.readline())
|
||||
version = f.group(1)
|
||||
if unicode_version == "":
|
||||
unicode_version = version
|
||||
elif unicode_version != version:
|
||||
print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
|
||||
|
||||
table = [default_value] * MAX_UNICODE
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
value = get_value(chardata)
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
for i in range(char, last + 1):
|
||||
# It is important not to overwrite a previously set
|
||||
# value because in the CaseFolding file there are lines
|
||||
# to be ignored (returning the default value of 0)
|
||||
# which often come after a line which has already set
|
||||
# data.
|
||||
if table[i] == default_value:
|
||||
table[i] = value
|
||||
file.close()
|
||||
return table
|
||||
|
||||
# Get the smallest possible C language type for the values
|
||||
def get_type_size(table):
|
||||
type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
|
||||
("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
|
||||
limits = [(0, 255), (0, 65535), (0, 4294967295),
|
||||
(-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
|
||||
minval = min(table)
|
||||
maxval = max(table)
|
||||
for num, (minlimit, maxlimit) in enumerate(limits):
|
||||
if minlimit <= minval and maxval <= maxlimit:
|
||||
return type_size[num]
|
||||
else:
|
||||
raise OverflowError("Too large to fit into C types")
|
||||
|
||||
def get_tables_size(*tables):
|
||||
total_size = 0
|
||||
for table in tables:
|
||||
type, size = get_type_size(table)
|
||||
total_size += size * len(table)
|
||||
return total_size
|
||||
|
||||
# Compress the table into the two stages
|
||||
def compress_table(table, block_size):
|
||||
blocks = {} # Dictionary for finding identical blocks
|
||||
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
|
||||
stage2 = [] # Stage 2 table contains the blocks with property values
|
||||
table = tuple(table)
|
||||
for i in range(0, len(table), block_size):
|
||||
block = table[i:i+block_size]
|
||||
start = blocks.get(block)
|
||||
if start is None:
|
||||
# Allocate a new block
|
||||
start = len(stage2) / block_size
|
||||
stage2 += block
|
||||
blocks[block] = start
|
||||
stage1.append(start)
|
||||
|
||||
return stage1, stage2
|
||||
|
||||
# Print a table
|
||||
def print_table(table, table_name, block_size = None):
|
||||
type, size = get_type_size(table)
|
||||
ELEMS_PER_LINE = 16
|
||||
|
||||
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
||||
if block_size:
|
||||
s += ", block = %d" % block_size
|
||||
print(s + " */")
|
||||
table = tuple(table)
|
||||
if block_size is None:
|
||||
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
||||
mult = MAX_UNICODE / len(table)
|
||||
for i in range(0, len(table), ELEMS_PER_LINE):
|
||||
print(fmt % (table[i:i+ELEMS_PER_LINE] +
|
||||
(int(i * mult),)))
|
||||
else:
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
el = ELEMS_PER_LINE
|
||||
else:
|
||||
el = block_size
|
||||
fmt = "%3d," * el + "\n"
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
fmt = fmt * int(block_size / ELEMS_PER_LINE)
|
||||
for i in range(0, len(table), block_size):
|
||||
print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
|
||||
print("};\n")
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
def combine_tables(*tables):
|
||||
records = {}
|
||||
index = []
|
||||
for t in zip(*tables):
|
||||
i = records.get(t)
|
||||
if i is None:
|
||||
i = records[t] = len(records)
|
||||
index.append(i)
|
||||
return index, records
|
||||
|
||||
def get_record_size_struct(records):
|
||||
size = 0
|
||||
structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
|
||||
'types in this structure definition from pcre2_internal.h (the actual\n' + \
|
||||
'field names will be different):\n\ntypedef struct {\n'
|
||||
for i in range(len(records[0])):
|
||||
record_slice = [record[i] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
# add padding: round up to the nearest power of slice_size
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
size += slice_size
|
||||
structure += '%s property_%d;\n' % (slice_type, i)
|
||||
|
||||
# round up to the first item of the next structure in array
|
||||
record_slice = [record[0] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
|
||||
structure += '} ucd_record;\n*/\n'
|
||||
return size, structure
|
||||
|
||||
def test_record_size():
|
||||
tests = [ \
|
||||
( [(3,), (6,), (6,), (1,)], 1 ), \
|
||||
( [(300,), (600,), (600,), (100,)], 2 ), \
|
||||
( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
||||
( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
||||
( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
||||
( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
||||
]
|
||||
for test in tests:
|
||||
size, struct = get_record_size_struct(test[0])
|
||||
assert(size == test[1])
|
||||
#print struct
|
||||
|
||||
def print_records(records, record_size):
|
||||
print('const ucd_record PRIV(ucd_records)[] = { ' + \
|
||||
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
|
||||
|
||||
records = list(zip(list(records.keys()), list(records.values())))
|
||||
records.sort(key = lambda x: x[1])
|
||||
for i, record in enumerate(records):
|
||||
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
|
||||
print('};\n')
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic',
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
||||
# New for Unicode 8.0.0
|
||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
||||
'SignWriting',
|
||||
# New for Unicode 10.0.0
|
||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
||||
# New for Unicode 11.0.0
|
||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
||||
'Old_Sogdian', 'Sogdian',
|
||||
# New for Unicode 12.0.0
|
||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
||||
# New for Unicode 13.0.0
|
||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
|
||||
# New for Unicode 14.0.0
|
||||
'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
|
||||
]
|
||||
|
||||
script_abbrevs = [
|
||||
'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
|
||||
'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
|
||||
'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
|
||||
'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
|
||||
'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
|
||||
'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
|
||||
'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
|
||||
#New for Unicode 5.0
|
||||
'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
|
||||
#New for Unicode 5.1
|
||||
'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
|
||||
'Sund', 'Vaii',
|
||||
#New for Unicode 5.2
|
||||
'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
|
||||
'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
|
||||
#New for Unicode 6.0.0
|
||||
'Batk', 'Brah', 'Mand',
|
||||
#New for Unicode 6.1.0
|
||||
'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
|
||||
#New for Unicode 7.0.0
|
||||
'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
|
||||
'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
|
||||
'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
|
||||
#New for Unicode 8.0.0
|
||||
'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
|
||||
#New for Unicode 10.0.0
|
||||
'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
|
||||
'Zanb',
|
||||
#New for Unicode 11.0.0
|
||||
'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
|
||||
#New for Unicode 12.0.0
|
||||
'Elym', 'Nand', 'Hmnp', 'Wcho',
|
||||
#New for Unicode 13.0.0
|
||||
'Chrs', 'Diak', 'Kits', 'Yezi',
|
||||
#New for Unicode 14.0.0
|
||||
'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith'
|
||||
]
|
||||
|
||||
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
||||
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
|
||||
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
|
||||
'ZWJ', 'Extended_Pictographic' ]
|
||||
|
||||
test_record_size()
|
||||
unicode_version = ""
|
||||
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
|
||||
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
||||
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
|
||||
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
|
||||
if chardata[1] != "Extended_Pictographic":
|
||||
continue
|
||||
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
for i in range(char, last + 1):
|
||||
if break_props[i] != break_property_names.index('Other'):
|
||||
print("WARNING: Emoji 0x%x has break property %s, not 'Other'",
|
||||
i, break_property_names[break_props[i]], file=sys.stderr)
|
||||
break_props[i] = break_property_names.index('Extended_Pictographic')
|
||||
file.close()
|
||||
|
||||
# The Script Extensions property default value is the Script value. Parse the
|
||||
# file, setting 'Unknown' as the default (this will never be a Script Extension
|
||||
# value), then scan it and fill in the default from Scripts. Code added by PH
|
||||
# in October 2018. Positive values are used for just a single script for a
|
||||
# code point. Negative values are negated offsets in a list of lists of
|
||||
# multiple scripts. Initialize this list with a single entry, as the zeroth
|
||||
# element is never used.
|
||||
|
||||
script_lists = [0]
|
||||
script_abbrevs_default = script_abbrevs.index('Zzzz')
|
||||
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
|
||||
|
||||
for i in range(0, MAX_UNICODE):
|
||||
if scriptx[i] == script_abbrevs_default:
|
||||
scriptx[i] = script[i]
|
||||
|
||||
# With the addition of the new Script Extensions field, we need some padding
|
||||
# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
|
||||
# greater than 255 to make the field 16 bits.
|
||||
|
||||
padding_dummy = [0] * MAX_UNICODE
|
||||
padding_dummy[0] = 256
|
||||
|
||||
# This block of code was added by PH in September 2012. I am not a Python
|
||||
# programmer, so the style is probably dreadful, but it does the job. It scans
|
||||
# the other_case table to find sets of more than two characters that must all
|
||||
# match each other caselessly. Later in this script a table of these sets is
|
||||
# written out. However, we have to do this work here in order to compute the
|
||||
# offsets in the table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
sets = []
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
# now have three characters that are case-equivalent.
|
||||
|
||||
if other_case[o] != -other_case[c]:
|
||||
t = o + other_case[o]
|
||||
|
||||
# Scan the existing sets to see if any of the three characters are already
|
||||
# part of a set. If so, unite the existing set with the new set.
|
||||
|
||||
appended = 0
|
||||
for s in sets:
|
||||
found = 0
|
||||
for x in s:
|
||||
if x == c or x == o or x == t:
|
||||
found = 1
|
||||
|
||||
# Add new characters to an existing set
|
||||
|
||||
if found:
|
||||
found = 0
|
||||
for y in [c, o, t]:
|
||||
for x in s:
|
||||
if x == y:
|
||||
found = 1
|
||||
if not found:
|
||||
s.append(y)
|
||||
appended = 1
|
||||
|
||||
# If we have not added to an existing set, create a new one.
|
||||
|
||||
if not appended:
|
||||
sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
caseless_offsets = [0] * MAX_UNICODE
|
||||
|
||||
offset = 1;
|
||||
for s in sets:
|
||||
for x in s:
|
||||
caseless_offsets[x] = offset
|
||||
offset += len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case, scriptx, padding_dummy)
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
size += get_tables_size(stage1, stage2)
|
||||
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
print("/* This module is generated by the maint/MultiStage2.py script.")
|
||||
print("Do not modify it by hand. Instead modify the script and run it")
|
||||
print("to regenerate this code.")
|
||||
print()
|
||||
print("As well as being part of the PCRE2 library, this module is #included")
|
||||
print("by the pcre2test program, which redefines the PRIV macro to change")
|
||||
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
|
||||
print("with the library. At present, just one of these tables is actually")
|
||||
print("needed. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_PCRE2TEST")
|
||||
print()
|
||||
print("#ifdef HAVE_CONFIG_H")
|
||||
print("#include \"config.h\"")
|
||||
print("#endif")
|
||||
print()
|
||||
print("#include \"pcre2_internal.h\"")
|
||||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
print()
|
||||
print("/* Unicode character database. */")
|
||||
print("/* This file was autogenerated by the MultiStage2.py script. */")
|
||||
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
|
||||
print()
|
||||
print("/* The tables herein are needed only when UCP support is built,")
|
||||
print("and in PCRE2 that happens automatically with UTF support.")
|
||||
print("This module should not be referenced otherwise, so")
|
||||
print("it should not matter whether it is compiled or not. However")
|
||||
print("a comment was received about space saving - maybe the guy linked")
|
||||
print("all the modules rather than using a library - so we include a")
|
||||
print("condition to cut out the tables when not needed. But don't leave")
|
||||
print("a totally empty module because some compilers barf at that.")
|
||||
print("Instead, just supply some small dummy tables. */")
|
||||
print()
|
||||
print("#ifndef SUPPORT_UNICODE")
|
||||
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
|
||||
print("const uint16_t PRIV(ucd_stage1)[] = {0};")
|
||||
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
|
||||
print("#else")
|
||||
print()
|
||||
print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
|
||||
print()
|
||||
print("/* If the 32-bit library is run in non-32-bit mode, character values")
|
||||
print("greater than 0x10ffff may be encountered. For these we set up a")
|
||||
print("special record. */")
|
||||
print()
|
||||
print("#if PCRE2_CODE_UNIT_WIDTH == 32")
|
||||
print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
|
||||
print(" ucp_Unknown, /* script */")
|
||||
print(" ucp_Cn, /* type unassigned */")
|
||||
print(" ucp_gbOther, /* grapheme break property */")
|
||||
print(" 0, /* case set */")
|
||||
print(" 0, /* other case */")
|
||||
print(" ucp_Unknown, /* script extension */")
|
||||
print(" 0, /* dummy filler */")
|
||||
print(" }};")
|
||||
print("#endif")
|
||||
print()
|
||||
print(record_struct)
|
||||
|
||||
# --- Added by PH: output the table of caseless character sets ---
|
||||
|
||||
print("/* This table contains lists of characters that are caseless sets of")
|
||||
print("more than one character. Each list is terminated by NOTACHAR. */\n")
|
||||
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
|
||||
print(" NOTACHAR,")
|
||||
for s in sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
print(' 0x%04x,' % x, end=' ')
|
||||
print(' NOTACHAR,')
|
||||
print('};')
|
||||
print()
|
||||
|
||||
# ------
|
||||
|
||||
print("/* When #included in pcre2test, we don't need the table of digit")
|
||||
print("sets, nor the the large main UCD tables. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_PCRE2TEST")
|
||||
print()
|
||||
|
||||
# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
digitsets = []
|
||||
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
|
||||
|
||||
for line in file:
|
||||
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
|
||||
if m is None:
|
||||
continue
|
||||
first = int(m.group(1),16)
|
||||
last = int(m.group(2),16)
|
||||
if ((last - first + 1) % 10) != 0:
|
||||
print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
|
||||
file=sys.stderr)
|
||||
while first < last:
|
||||
digitsets.append(first + 9)
|
||||
first += 10
|
||||
file.close()
|
||||
digitsets.sort()
|
||||
|
||||
print("/* This table lists the code points for the '9' characters in each")
|
||||
print("set of decimal digits. It is used to ensure that all the digits in")
|
||||
print("a script run come from the same set. */\n")
|
||||
print("const uint32_t PRIV(ucd_digit_sets)[] = {")
|
||||
|
||||
print(" %d, /* Number of subsequent values */" % len(digitsets), end='')
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
print("\n ", end='')
|
||||
count = 0
|
||||
print(" 0x%05x," % d, end='')
|
||||
count += 1
|
||||
print("\n};\n")
|
||||
|
||||
print("/* This vector is a list of lists of scripts for the Script Extension")
|
||||
print("property. Each sublist is zero-terminated. */\n")
|
||||
print("const uint8_t PRIV(ucd_script_sets)[] = {")
|
||||
|
||||
count = 0
|
||||
print(" /* 0 */", end='')
|
||||
for d in script_lists:
|
||||
print(" %3d," % d, end='')
|
||||
count += 1
|
||||
if d == 0:
|
||||
print("\n /* %3d */" % count, end='')
|
||||
print("\n};\n")
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
print("/* These are the main two-stage UCD tables. The fields in each record are:")
|
||||
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
|
||||
print("offset to multichar other cases or zero (8 bits), offset to other case")
|
||||
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
|
||||
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
|
||||
|
||||
print_records(records, record_size)
|
||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
|
||||
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
|
||||
print("#endif")
|
||||
print("#endif /* SUPPORT_UNICODE */")
|
||||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
|
||||
|
||||
# This code was part of the original contribution, but is commented out as it
|
||||
# was never used. A two-stage table has sufficed.
|
||||
|
||||
"""
|
||||
|
||||
# Three-stage tables:
|
||||
|
||||
# Find the optimum block size for 3-stage table
|
||||
min_size = sys.maxint
|
||||
for stage3_block in [2 ** i for i in range(2,6)]:
|
||||
stage_i, stage3 = compress_table(table, stage3_block)
|
||||
for stage2_block in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * 4
|
||||
stage1, stage2 = compress_table(stage_i, stage2_block)
|
||||
size += get_tables_size(stage1, stage2, stage3)
|
||||
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
||||
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
||||
|
||||
print "/* Total size: %d bytes" % min_size */
|
||||
print_records(records)
|
||||
print_table(min_stage1, 'ucd_stage1')
|
||||
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
||||
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
||||
|
||||
"""
|
198
maint/README
198
maint/README
|
@ -16,99 +16,122 @@ and also contains some notes for maintainers. Its contents are:
|
|||
Files in the maint directory
|
||||
============================
|
||||
|
||||
GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
|
||||
that contains Unicode script names in a long string with
|
||||
offsets, which is tedious to maintain by hand.
|
||||
GenerateCommon.py
|
||||
A Python module containing data and functions that are used by the other
|
||||
Generate scripts.
|
||||
|
||||
ManyConfigTests A shell script that runs "configure, make, test" a number of
|
||||
times with different configuration settings.
|
||||
GenerateTest26.py
|
||||
A Python script that generates input and expected output test data for test
|
||||
26, which tests certain aspects of Unicode property support.
|
||||
|
||||
MultiStage2.py A Python script that generates the file pcre2_ucd.c from six
|
||||
Unicode data files, which are themselves downloaded from the
|
||||
Unicode web site. Run this script in the "maint" directory.
|
||||
The generated file is written to stdout. It contains the
|
||||
tables for a 2-stage lookup of Unicode properties, along with
|
||||
some auxiliary tables.
|
||||
GenerateUcd.py
|
||||
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
|
||||
and Unicode data files, which are themselves downloaded from the Unicode web
|
||||
site. The generated file contains the tables for a 2-stage lookup of Unicode
|
||||
properties, along with some auxiliary tables. The script starts with a long
|
||||
comment that gives details of the tables it constructs.
|
||||
|
||||
GenerateUcpHeader.py
|
||||
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
|
||||
and Unicode data files. The generated file defines constants for various
|
||||
Unicode property values.
|
||||
|
||||
GenerateUcpTables.py
|
||||
A Python script that generates the file pcre2_ucptables.c from
|
||||
GenerateCommon.py and Unicode data files. The generated file contains tables
|
||||
for looking up Unicode property names.
|
||||
|
||||
ManyConfigTests
|
||||
A shell script that runs "configure, make, test" a number of times with
|
||||
different configuration settings.
|
||||
|
||||
pcre2_chartables.c.non-standard
|
||||
This is a set of character tables that came from a Windows
|
||||
system. It has characters greater than 128 that are set as
|
||||
spaces, amongst other things. I kept it so that it can be
|
||||
used for testing from time to time.
|
||||
This is a set of character tables that came from a Windows system. It has
|
||||
characters greater than 128 that are set as spaces, amongst other things. I
|
||||
kept it so that it can be used for testing from time to time.
|
||||
|
||||
README This file.
|
||||
README
|
||||
This file.
|
||||
|
||||
Unicode.tables The files in this directory were downloaded from the Unicode
|
||||
web site. They contain information about Unicode characters
|
||||
and scripts. The ones used by the MultiStage2.py script are
|
||||
CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt,
|
||||
ScriptExtensions.txt, GraphemeBreakProperty.txt, and
|
||||
emoji-data.txt. I've kept UnicodeData.txt (which is no longer
|
||||
used by the script) because it is useful occasionally for
|
||||
manually looking up the details of certain characters.
|
||||
However, note that character names in this file such as
|
||||
"Arabic sign sanah" do NOT mean that the character is in a
|
||||
Unicode.tables
|
||||
The files in this directory were downloaded from the Unicode web site. They
|
||||
contain information about Unicode characters and scripts, and are used by the
|
||||
Generate scripts. There is also UnicodeData.txt, which is no longer used by
|
||||
any script, because it is useful occasionally for manually looking up the
|
||||
details of certain characters. However, note that character names in this
|
||||
file such as "Arabic sign sanah" do NOT mean that the character is in a
|
||||
particular script (in this case, Arabic). Scripts.txt and
|
||||
ScriptExtensions.txt are where to look for script information.
|
||||
|
||||
ucptest.c A short C program for testing the Unicode property macros
|
||||
that do lookups in the pcre2_ucd.c data, mainly useful after
|
||||
rebuilding the Unicode property table. Compile and run this in
|
||||
the "maint" directory (see comments at its head). This program
|
||||
can also be used to find characters with specific properties.
|
||||
ucptest.c
|
||||
A program for testing the Unicode property macros that do lookups in the
|
||||
pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
|
||||
Compile and run this in the "maint" directory (see comments at its head).
|
||||
This program can also be used to find characters with specific properties and
|
||||
to list which properties are supported.
|
||||
|
||||
ucptestdata A directory containing four files, testinput{1,2} and
|
||||
testoutput{1,2}, for use in conjunction with the ucptest
|
||||
program.
|
||||
ucptestdata
|
||||
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
|
||||
use in conjunction with the ucptest program.
|
||||
|
||||
utf8.c A short, freestanding C program for converting a Unicode code
|
||||
point into a sequence of bytes in the UTF-8 encoding, and vice
|
||||
versa. If its argument is a hex number such as 0x1234, it
|
||||
outputs a list of the equivalent UTF-8 bytes. If its argument
|
||||
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
|
||||
treats them as a UTF-8 character and outputs the equivalent
|
||||
code point in hex. See comments at its head for details.
|
||||
utf8.c
|
||||
A short, freestanding C program for converting a Unicode code point into a
|
||||
sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
|
||||
hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
|
||||
If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
|
||||
treats them as a UTF-8 string and outputs the equivalent code points in hex.
|
||||
See comments at its head for details.
|
||||
|
||||
|
||||
Updating to a new Unicode release
|
||||
=================================
|
||||
|
||||
When there is a new release of Unicode, the files in Unicode.tables must be
|
||||
refreshed from the web site. If the new version of Unicode adds new character
|
||||
scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
|
||||
GenerateUtt.py scripts must be edited to add the new names. I have been adding
|
||||
each new group at the end of the relevant list, with a comment. Note also that
|
||||
both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode
|
||||
script names.
|
||||
refreshed from the web site. Once that is done, the four Python scripts that
|
||||
generate files from the Unicode data can be run from within the "maint"
|
||||
directory.
|
||||
|
||||
MultiStage2.py has two lists: the full names and the abbreviations that are
|
||||
found in the ScriptExtensions.txt file. A list of script names and their
|
||||
abbreviations can be found in the PropertyValueAliases.txt file on the
|
||||
Unicode web site. There is also a Wikipedia page that lists them, and notes the
|
||||
Unicode version in which they were introduced:
|
||||
Note: Previously, it was necessary to update lists of scripts and their
|
||||
abbreviations by hand before running the Python scripts. This is no longer
|
||||
necessary because the scripts have been upgraded to extract this information
|
||||
themselves. Also, there used to be explicit lists of scripts in two of the man
|
||||
pages. This is no longer the case; the pcre2test program can now output a list
|
||||
of supported scripts.
|
||||
|
||||
https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
|
||||
You can give an output file name as an argument to the following scripts, but
|
||||
by default:
|
||||
|
||||
Once the script name lists have been updated, MultiStage2.py can be run to
|
||||
generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to
|
||||
generate the tricky tables for inclusion in pcre2_tables.c (which must be
|
||||
hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x
|
||||
not in list", the cause is usually a missing (or misspelt) name in one of the
|
||||
lists of scripts.
|
||||
GenerateUcd.py creates pcre2_ucd.c )
|
||||
GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory
|
||||
GenerateUcpTables.py creates pcre2_ucptables.c )
|
||||
|
||||
The ucptest program can be compiled and used to check that the new tables in
|
||||
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
|
||||
number of test characters. It used to be necessary to update the source
|
||||
ucptest.c whenever new Unicode scripts were added, but this is no longer
|
||||
required because that program now uses the lists in the PCRE2 source. However,
|
||||
adding a few tests for new scripts to the files in ucptestdata is a good idea.
|
||||
These files can be compared against the existing versions in the src directory
|
||||
to check on any changes before replacing the old files, but you can also
|
||||
generate directly into the final location by running:
|
||||
|
||||
./GenerateUcd.py ../src/pcre2_ucd.c
|
||||
./GenerateUcpHeader.py ../src/pcre2_ucp.h
|
||||
./GenerateUcpTables.py ../src/pcre2_ucptables.c
|
||||
|
||||
Once the .c and .h files are in the ../src directory, the ucptest program can
|
||||
be compiled and used to check that the new tables work properly. The data files
|
||||
in ucptestdata are set up to check a number of test characters. See the
|
||||
comments at the start of ucptest.c. If there are new scripts, adding a few
|
||||
tests to the files in ucptestdata is a good idea.
|
||||
|
||||
Finally, you should run the GenerateTest26.py script to regenerate new versions
|
||||
of the input and expected output from a series of Unicode property tests that
|
||||
are automatically generated from the Unicode data files. By default, the files
|
||||
are written to testinput26 and testoutput26 in the current directory, but you
|
||||
can give an alternative directory name as an argument to the script. These
|
||||
files should eventually be installed in the main testdata directory.
|
||||
|
||||
|
||||
Preparing for a PCRE2 release
|
||||
=============================
|
||||
|
||||
This section contains a checklist of things that I consult before building a
|
||||
distribution for a new release.
|
||||
This section contains a checklist of things that I do before building a new
|
||||
release.
|
||||
|
||||
. Ensure that the version number and version date are correct in configure.ac.
|
||||
|
||||
|
@ -117,17 +140,16 @@ distribution for a new release.
|
|||
|
||||
. If new build options or new source files have been added, ensure that they
|
||||
are added to the CMake files as well as to the autoconf files. The relevant
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release
|
||||
tarball, test it out with CMake if there have been changes here.
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release, test
|
||||
it out with CMake if there have been changes here.
|
||||
|
||||
. Run ./autogen.sh to ensure everything is up-to-date.
|
||||
|
||||
. Compile and test with many different config options, and combinations of
|
||||
options. Also, test with valgrind by running "RunTest valgrind" and
|
||||
"RunGrepTest valgrind" (which takes quite a long time). The script
|
||||
maint/ManyConfigTests now encapsulates this testing. It runs tests with
|
||||
different configurations, and it also runs some of them with valgrind, all of
|
||||
which can take quite some time.
|
||||
"RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
|
||||
this testing. It runs tests with different configurations, and it also runs
|
||||
some of them with valgrind, all of which can take quite some time.
|
||||
|
||||
. Run tests in both 32-bit and 64-bit environments if possible. I can no longer
|
||||
run 32-bit tests.
|
||||
|
@ -142,7 +164,8 @@ distribution for a new release.
|
|||
-fsanitize=signed-integer-overflow
|
||||
|
||||
. Do a test build using CMake. Remove src/config.h first, lest it override the
|
||||
version that CMake creates. Do NOT use parallel make.
|
||||
version that CMake creates. Also do a CMake unity build to check that it
|
||||
still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.
|
||||
|
||||
. Run perltest.sh on the test data for tests 1 and 4. The output should match
|
||||
the PCRE2 test output, apart from the version identification at the start of
|
||||
|
@ -161,11 +184,12 @@ distribution for a new release.
|
|||
systems. For example, on Solaris it is helpful to test using Sun's cc
|
||||
compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
|
||||
64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
|
||||
for test 2. Since I retired I can no longer do much of this, but instead I
|
||||
rely on putting out release candidates for testing by the community.
|
||||
for test 2. Since I retired I can no longer do much of this. There are
|
||||
automated tests under Ubuntu, Alpine, and Windows that are now set up as
|
||||
GitHub actions. Check that they are running clean.
|
||||
|
||||
. The buildbots at http://buildfarm.opencsw.org/ do some automated testing
|
||||
of PCRE2 and should be checked before putting out a release.
|
||||
of PCRE2 and should also be checked before putting out a release.
|
||||
|
||||
|
||||
Updating version info for libtool
|
||||
|
@ -221,10 +245,11 @@ it reports them and then aborts. Otherwise it removes trailing spaces from
|
|||
sources and refreshes the HTML documentation. Update the GitHub repository with
|
||||
"git push".
|
||||
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarball
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
|
||||
and the zipball. I then sign these files. Double-check with "git status" that
|
||||
the repository is fully up-to-date, then create a new tag on GitHub. Upload the
|
||||
tarball, zipball, and the signatures as "assets" of the GitHub release.
|
||||
the repository is fully up-to-date, then create a new tag and a release on
|
||||
GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
|
||||
GitHub release.
|
||||
|
||||
When the new release is out, don't forget to tell webmaster@pcre.org and the
|
||||
mailing list.
|
||||
|
@ -343,8 +368,6 @@ years.
|
|||
|
||||
See Unicode TR 29. The last two are very much aimed at natural language.
|
||||
|
||||
. (?[...]) extended classes: big project.
|
||||
|
||||
. Allow a callout to specify a number of characters to skip. This can be done
|
||||
compatibly via an extra callout field.
|
||||
|
||||
|
@ -414,13 +437,8 @@ years.
|
|||
with lookarounds for \b and \B. Ideally the setting should last till the end
|
||||
of the group, which means remembering all previous settings; maybe a fixed
|
||||
amount of stack would do - how deep would anyone want to nest these things?
|
||||
See GitHub issue #13 for a compendium of character class issues.
|
||||
|
||||
. Recognize the short script names. They are already listed in maint/
|
||||
Multistage2.py because they are needed for scanning the script extensions
|
||||
file.
|
||||
|
||||
. Use script extensions for \p?
|
||||
See GitHub issue #13 for a compendium of character class issues, including
|
||||
(?[...]) extended classes.
|
||||
|
||||
. A user suggested something like --with-build-info to set a build information
|
||||
string that could be retrieved by pcre2_config(). However, there's no
|
||||
|
@ -439,4 +457,4 @@ years.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 26 August 2021
|
||||
Last updated: 25 April 2022
|
||||
|
|
|
@ -0,0 +1,633 @@
|
|||
# BidiMirroring-14.0.0.txt
|
||||
# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
|
||||
# © 2021 Unicode®, Inc.
|
||||
# For terms of use, see https://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see https://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Bidi_Mirroring_Glyph Property
|
||||
#
|
||||
# This file is an informative contributory data file in the
|
||||
# Unicode Character Database.
|
||||
#
|
||||
# This data file lists characters that have the Bidi_Mirrored=Yes property
|
||||
# value, for which there is another Unicode character that typically has a glyph
|
||||
# that is the mirror image of the original character's glyph.
|
||||
#
|
||||
# The repertoire covered by the file is Unicode 14.0.0.
|
||||
#
|
||||
# The file contains a list of lines with mappings from one code point
|
||||
# to another one for character-based mirroring.
|
||||
# Note that for "real" mirroring, a rendering engine needs to select
|
||||
# appropriate alternative glyphs, and that many Unicode characters do not
|
||||
# have a mirror-image Unicode character.
|
||||
#
|
||||
# Each mapping line contains two fields, separated by a semicolon (';').
|
||||
# Each of the two fields contains a code point represented as a
|
||||
# variable-length hexadecimal value with 4 to 6 digits.
|
||||
# A comment indicates where the characters are "BEST FIT" mirroring.
|
||||
#
|
||||
# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
|
||||
# characters exist with mirrored glyphs, are
|
||||
# listed as comments at the end of the file.
|
||||
#
|
||||
# Formally, the default value of the Bidi_Mirroring_Glyph property
|
||||
# for each code point is <none>, unless a mapping to
|
||||
# some other character is specified in this data file. When a code
|
||||
# point has the default value for the Bidi_Mirroring_Glyph property,
|
||||
# that means that no other character exists whose glyph is suitable
|
||||
# for character-based mirroring.
|
||||
#
|
||||
# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
|
||||
# at https://www.unicode.org/reports/tr9/
|
||||
#
|
||||
# This file was originally created by Markus Scherer.
|
||||
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
|
||||
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
|
||||
#
|
||||
# Historical and Compatibility Information:
|
||||
#
|
||||
# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
|
||||
# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
|
||||
# See https://www.microsoft.com/typography/otspec/ompl.txt
|
||||
#
|
||||
# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
|
||||
# added one mirroring pair: 27CB <--> 27CD.
|
||||
#
|
||||
# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
|
||||
# underwent a substantial revision, to formally recognize all of the
|
||||
# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
|
||||
# added after the freezing of the OMPL list. As a result, starting
|
||||
# with Unicode 11.0, the bmg mapping values more accurately reflect
|
||||
# the current status of glyphs for Bidi_Mirrored characters in
|
||||
# the Unicode Standard, but this listing now extends significantly
|
||||
# beyond the frozen OMPL list. Implementers should be aware of this
|
||||
# intentional distinction.
|
||||
#
|
||||
# ############################################################
|
||||
#
|
||||
# Property: Bidi_Mirroring_Glyph
|
||||
#
|
||||
# @missing: 0000..10FFFF; <none>
|
||||
|
||||
0028; 0029 # LEFT PARENTHESIS
|
||||
0029; 0028 # RIGHT PARENTHESIS
|
||||
003C; 003E # LESS-THAN SIGN
|
||||
003E; 003C # GREATER-THAN SIGN
|
||||
005B; 005D # LEFT SQUARE BRACKET
|
||||
005D; 005B # RIGHT SQUARE BRACKET
|
||||
007B; 007D # LEFT CURLY BRACKET
|
||||
007D; 007B # RIGHT CURLY BRACKET
|
||||
00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
|
||||
0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
|
||||
0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
|
||||
0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
|
||||
169B; 169C # OGHAM FEATHER MARK
|
||||
169C; 169B # OGHAM REVERSED FEATHER MARK
|
||||
2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
|
||||
2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
|
||||
207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
|
||||
207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
|
||||
208D; 208E # SUBSCRIPT LEFT PARENTHESIS
|
||||
208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
|
||||
2208; 220B # ELEMENT OF
|
||||
2209; 220C # [BEST FIT] NOT AN ELEMENT OF
|
||||
220A; 220D # SMALL ELEMENT OF
|
||||
220B; 2208 # CONTAINS AS MEMBER
|
||||
220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
|
||||
220D; 220A # SMALL CONTAINS AS MEMBER
|
||||
2215; 29F5 # DIVISION SLASH
|
||||
221F; 2BFE # RIGHT ANGLE
|
||||
2220; 29A3 # ANGLE
|
||||
2221; 299B # MEASURED ANGLE
|
||||
2222; 29A0 # SPHERICAL ANGLE
|
||||
2224; 2AEE # DOES NOT DIVIDE
|
||||
223C; 223D # TILDE OPERATOR
|
||||
223D; 223C # REVERSED TILDE
|
||||
2243; 22CD # ASYMPTOTICALLY EQUAL TO
|
||||
2245; 224C # APPROXIMATELY EQUAL TO
|
||||
224C; 2245 # ALL EQUAL TO
|
||||
2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
|
||||
2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
|
||||
2254; 2255 # COLON EQUALS
|
||||
2255; 2254 # EQUALS COLON
|
||||
2264; 2265 # LESS-THAN OR EQUAL TO
|
||||
2265; 2264 # GREATER-THAN OR EQUAL TO
|
||||
2266; 2267 # LESS-THAN OVER EQUAL TO
|
||||
2267; 2266 # GREATER-THAN OVER EQUAL TO
|
||||
2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
|
||||
2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
|
||||
226A; 226B # MUCH LESS-THAN
|
||||
226B; 226A # MUCH GREATER-THAN
|
||||
226E; 226F # [BEST FIT] NOT LESS-THAN
|
||||
226F; 226E # [BEST FIT] NOT GREATER-THAN
|
||||
2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
|
||||
2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
|
||||
2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
|
||||
2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
|
||||
2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
|
||||
2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
|
||||
2276; 2277 # LESS-THAN OR GREATER-THAN
|
||||
2277; 2276 # GREATER-THAN OR LESS-THAN
|
||||
2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
|
||||
2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
|
||||
227A; 227B # PRECEDES
|
||||
227B; 227A # SUCCEEDS
|
||||
227C; 227D # PRECEDES OR EQUAL TO
|
||||
227D; 227C # SUCCEEDS OR EQUAL TO
|
||||
227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
|
||||
227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
|
||||
2280; 2281 # [BEST FIT] DOES NOT PRECEDE
|
||||
2281; 2280 # [BEST FIT] DOES NOT SUCCEED
|
||||
2282; 2283 # SUBSET OF
|
||||
2283; 2282 # SUPERSET OF
|
||||
2284; 2285 # [BEST FIT] NOT A SUBSET OF
|
||||
2285; 2284 # [BEST FIT] NOT A SUPERSET OF
|
||||
2286; 2287 # SUBSET OF OR EQUAL TO
|
||||
2287; 2286 # SUPERSET OF OR EQUAL TO
|
||||
2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
|
||||
2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
|
||||
228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
|
||||
228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
|
||||
228F; 2290 # SQUARE IMAGE OF
|
||||
2290; 228F # SQUARE ORIGINAL OF
|
||||
2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
|
||||
2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
|
||||
2298; 29B8 # CIRCLED DIVISION SLASH
|
||||
22A2; 22A3 # RIGHT TACK
|
||||
22A3; 22A2 # LEFT TACK
|
||||
22A6; 2ADE # ASSERTION
|
||||
22A8; 2AE4 # TRUE
|
||||
22A9; 2AE3 # FORCES
|
||||
22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
22B0; 22B1 # PRECEDES UNDER RELATION
|
||||
22B1; 22B0 # SUCCEEDS UNDER RELATION
|
||||
22B2; 22B3 # NORMAL SUBGROUP OF
|
||||
22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
|
||||
22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
|
||||
22B6; 22B7 # ORIGINAL OF
|
||||
22B7; 22B6 # IMAGE OF
|
||||
22B8; 27DC # MULTIMAP
|
||||
22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CB; 22CC # LEFT SEMIDIRECT PRODUCT
|
||||
22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
|
||||
22CD; 2243 # REVERSED TILDE EQUALS
|
||||
22D0; 22D1 # DOUBLE SUBSET
|
||||
22D1; 22D0 # DOUBLE SUPERSET
|
||||
22D6; 22D7 # LESS-THAN WITH DOT
|
||||
22D7; 22D6 # GREATER-THAN WITH DOT
|
||||
22D8; 22D9 # VERY MUCH LESS-THAN
|
||||
22D9; 22D8 # VERY MUCH GREATER-THAN
|
||||
22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
|
||||
22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
|
||||
22DC; 22DD # EQUAL TO OR LESS-THAN
|
||||
22DD; 22DC # EQUAL TO OR GREATER-THAN
|
||||
22DE; 22DF # EQUAL TO OR PRECEDES
|
||||
22DF; 22DE # EQUAL TO OR SUCCEEDS
|
||||
22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
|
||||
22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
|
||||
22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
|
||||
22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
|
||||
22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
|
||||
22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
|
||||
22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
|
||||
22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
|
||||
22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
|
||||
22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
|
||||
22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
|
||||
22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
|
||||
22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
|
||||
22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
|
||||
22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
|
||||
22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
|
||||
22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F6; 22FD # ELEMENT OF WITH OVERBAR
|
||||
22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
|
||||
22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
|
||||
22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FD; 22F6 # CONTAINS WITH OVERBAR
|
||||
22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
|
||||
2308; 2309 # LEFT CEILING
|
||||
2309; 2308 # RIGHT CEILING
|
||||
230A; 230B # LEFT FLOOR
|
||||
230B; 230A # RIGHT FLOOR
|
||||
2329; 232A # LEFT-POINTING ANGLE BRACKET
|
||||
232A; 2329 # RIGHT-POINTING ANGLE BRACKET
|
||||
2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
|
||||
2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
|
||||
276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
|
||||
276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
|
||||
276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
|
||||
2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
|
||||
27C3; 27C4 # OPEN SUBSET
|
||||
27C4; 27C3 # OPEN SUPERSET
|
||||
27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
|
||||
27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
|
||||
27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
|
||||
27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
|
||||
27CB; 27CD # MATHEMATICAL RISING DIAGONAL
|
||||
27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
|
||||
27D5; 27D6 # LEFT OUTER JOIN
|
||||
27D6; 27D5 # RIGHT OUTER JOIN
|
||||
27DC; 22B8 # LEFT MULTIMAP
|
||||
27DD; 27DE # LONG RIGHT TACK
|
||||
27DE; 27DD # LONG LEFT TACK
|
||||
27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
|
||||
27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
|
||||
27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
|
||||
27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
|
||||
27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
|
||||
27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
|
||||
27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
|
||||
27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
|
||||
27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
|
||||
27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
|
||||
27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
|
||||
27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
|
||||
27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
|
||||
2983; 2984 # LEFT WHITE CURLY BRACKET
|
||||
2984; 2983 # RIGHT WHITE CURLY BRACKET
|
||||
2985; 2986 # LEFT WHITE PARENTHESIS
|
||||
2986; 2985 # RIGHT WHITE PARENTHESIS
|
||||
2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
|
||||
2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
|
||||
2989; 298A # Z NOTATION LEFT BINDING BRACKET
|
||||
298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
|
||||
298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
|
||||
298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
|
||||
298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
2991; 2992 # LEFT ANGLE BRACKET WITH DOT
|
||||
2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
|
||||
2993; 2994 # LEFT ARC LESS-THAN BRACKET
|
||||
2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
|
||||
2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
|
||||
2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
|
||||
2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
|
||||
2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
|
||||
299B; 2221 # MEASURED ANGLE OPENING LEFT
|
||||
29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
|
||||
29A3; 2220 # REVERSED ANGLE
|
||||
29A4; 29A5 # ANGLE WITH UNDERBAR
|
||||
29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
|
||||
29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
|
||||
29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
|
||||
29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
|
||||
29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
|
||||
29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
|
||||
29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
|
||||
29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
|
||||
29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
|
||||
29B8; 2298 # CIRCLED REVERSE SOLIDUS
|
||||
29C0; 29C1 # CIRCLED LESS-THAN
|
||||
29C1; 29C0 # CIRCLED GREATER-THAN
|
||||
29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
|
||||
29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
|
||||
29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
|
||||
29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
|
||||
29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
|
||||
29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
|
||||
29D4; 29D5 # TIMES WITH LEFT HALF BLACK
|
||||
29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
|
||||
29D8; 29D9 # LEFT WIGGLY FENCE
|
||||
29D9; 29D8 # RIGHT WIGGLY FENCE
|
||||
29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
|
||||
29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
|
||||
29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
|
||||
29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
|
||||
29F5; 2215 # REVERSE SOLIDUS OPERATOR
|
||||
29F8; 29F9 # BIG SOLIDUS
|
||||
29F9; 29F8 # BIG REVERSE SOLIDUS
|
||||
29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
|
||||
29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
|
||||
2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
|
||||
2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
|
||||
2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
|
||||
2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
|
||||
2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
|
||||
2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
|
||||
2A3C; 2A3D # INTERIOR PRODUCT
|
||||
2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
|
||||
2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
|
||||
2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
|
||||
2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
|
||||
2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
|
||||
2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
|
||||
2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
|
||||
2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
|
||||
2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
|
||||
2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
|
||||
2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
|
||||
2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
|
||||
2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
|
||||
2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
|
||||
2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
|
||||
2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
|
||||
2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
|
||||
2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
|
||||
2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
|
||||
2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
|
||||
2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
|
||||
2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
|
||||
2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
|
||||
2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
|
||||
2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
|
||||
2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
|
||||
2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
|
||||
2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
|
||||
2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
|
||||
2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
|
||||
2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
|
||||
2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
|
||||
2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
|
||||
2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
|
||||
2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
|
||||
2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AAA; 2AAB # SMALLER THAN
|
||||
2AAB; 2AAA # LARGER THAN
|
||||
2AAC; 2AAD # SMALLER THAN OR EQUAL TO
|
||||
2AAD; 2AAC # LARGER THAN OR EQUAL TO
|
||||
2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
|
||||
2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
|
||||
2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
|
||||
2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
|
||||
2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
|
||||
2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
|
||||
2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
|
||||
2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
|
||||
2ABB; 2ABC # DOUBLE PRECEDES
|
||||
2ABC; 2ABB # DOUBLE SUCCEEDS
|
||||
2ABD; 2ABE # SUBSET WITH DOT
|
||||
2ABE; 2ABD # SUPERSET WITH DOT
|
||||
2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
|
||||
2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
|
||||
2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
|
||||
2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
|
||||
2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
|
||||
2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
|
||||
2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
|
||||
2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
|
||||
2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
|
||||
2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
|
||||
2ACF; 2AD0 # CLOSED SUBSET
|
||||
2AD0; 2ACF # CLOSED SUPERSET
|
||||
2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
|
||||
2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
|
||||
2AD3; 2AD4 # SUBSET ABOVE SUPERSET
|
||||
2AD4; 2AD3 # SUPERSET ABOVE SUBSET
|
||||
2AD5; 2AD6 # SUBSET ABOVE SUBSET
|
||||
2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
|
||||
2ADE; 22A6 # SHORT LEFT TACK
|
||||
2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
|
||||
2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AEC; 2AED # DOUBLE STROKE NOT SIGN
|
||||
2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
|
||||
2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
|
||||
2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
|
||||
2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
|
||||
2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
|
||||
2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
|
||||
2BFE; 221F # REVERSED RIGHT ANGLE
|
||||
2E02; 2E03 # LEFT SUBSTITUTION BRACKET
|
||||
2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
|
||||
2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
|
||||
2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
|
||||
2E09; 2E0A # LEFT TRANSPOSITION BRACKET
|
||||
2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
|
||||
2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
|
||||
2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
|
||||
2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
|
||||
2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
|
||||
2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
|
||||
2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
|
||||
2E22; 2E23 # TOP LEFT HALF BRACKET
|
||||
2E23; 2E22 # TOP RIGHT HALF BRACKET
|
||||
2E24; 2E25 # BOTTOM LEFT HALF BRACKET
|
||||
2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
|
||||
2E26; 2E27 # LEFT SIDEWAYS U BRACKET
|
||||
2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
|
||||
2E28; 2E29 # LEFT DOUBLE PARENTHESIS
|
||||
2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
|
||||
2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
|
||||
2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59; 2E5A # TOP HALF LEFT PARENTHESIS
|
||||
2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
|
||||
2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
|
||||
2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
|
||||
3008; 3009 # LEFT ANGLE BRACKET
|
||||
3009; 3008 # RIGHT ANGLE BRACKET
|
||||
300A; 300B # LEFT DOUBLE ANGLE BRACKET
|
||||
300B; 300A # RIGHT DOUBLE ANGLE BRACKET
|
||||
300C; 300D # [BEST FIT] LEFT CORNER BRACKET
|
||||
300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
|
||||
300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
|
||||
300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
|
||||
3010; 3011 # LEFT BLACK LENTICULAR BRACKET
|
||||
3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
|
||||
3014; 3015 # LEFT TORTOISE SHELL BRACKET
|
||||
3015; 3014 # RIGHT TORTOISE SHELL BRACKET
|
||||
3016; 3017 # LEFT WHITE LENTICULAR BRACKET
|
||||
3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
|
||||
3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
|
||||
3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
301A; 301B # LEFT WHITE SQUARE BRACKET
|
||||
301B; 301A # RIGHT WHITE SQUARE BRACKET
|
||||
FE59; FE5A # SMALL LEFT PARENTHESIS
|
||||
FE5A; FE59 # SMALL RIGHT PARENTHESIS
|
||||
FE5B; FE5C # SMALL LEFT CURLY BRACKET
|
||||
FE5C; FE5B # SMALL RIGHT CURLY BRACKET
|
||||
FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
|
||||
FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
|
||||
FE64; FE65 # SMALL LESS-THAN SIGN
|
||||
FE65; FE64 # SMALL GREATER-THAN SIGN
|
||||
FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
|
||||
FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
|
||||
FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
|
||||
FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
|
||||
FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
|
||||
FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
|
||||
FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
|
||||
FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
|
||||
FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
|
||||
FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
|
||||
FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
||||
# The following characters have no appropriate mirroring character.
|
||||
# For these characters it is up to the rendering system
|
||||
# to provide mirrored glyphs.
|
||||
|
||||
# 2140; DOUBLE-STRUCK N-ARY SUMMATION
|
||||
# 2201; COMPLEMENT
|
||||
# 2202; PARTIAL DIFFERENTIAL
|
||||
# 2203; THERE EXISTS
|
||||
# 2204; THERE DOES NOT EXIST
|
||||
# 2211; N-ARY SUMMATION
|
||||
# 2216; SET MINUS
|
||||
# 221A; SQUARE ROOT
|
||||
# 221B; CUBE ROOT
|
||||
# 221C; FOURTH ROOT
|
||||
# 221D; PROPORTIONAL TO
|
||||
# 2226; NOT PARALLEL TO
|
||||
# 222B; INTEGRAL
|
||||
# 222C; DOUBLE INTEGRAL
|
||||
# 222D; TRIPLE INTEGRAL
|
||||
# 222E; CONTOUR INTEGRAL
|
||||
# 222F; SURFACE INTEGRAL
|
||||
# 2230; VOLUME INTEGRAL
|
||||
# 2231; CLOCKWISE INTEGRAL
|
||||
# 2232; CLOCKWISE CONTOUR INTEGRAL
|
||||
# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
|
||||
# 2239; EXCESS
|
||||
# 223B; HOMOTHETIC
|
||||
# 223E; INVERTED LAZY S
|
||||
# 223F; SINE WAVE
|
||||
# 2240; WREATH PRODUCT
|
||||
# 2241; NOT TILDE
|
||||
# 2242; MINUS TILDE
|
||||
# 2244; NOT ASYMPTOTICALLY EQUAL TO
|
||||
# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
|
||||
# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
|
||||
# 2248; ALMOST EQUAL TO
|
||||
# 2249; NOT ALMOST EQUAL TO
|
||||
# 224A; ALMOST EQUAL OR EQUAL TO
|
||||
# 224B; TRIPLE TILDE
|
||||
# 225F; QUESTIONED EQUAL TO
|
||||
# 2260; NOT EQUAL TO
|
||||
# 2262; NOT IDENTICAL TO
|
||||
# 228C; MULTISET
|
||||
# 22A7; MODELS
|
||||
# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
|
||||
# 22AC; DOES NOT PROVE
|
||||
# 22AD; NOT TRUE
|
||||
# 22AE; DOES NOT FORCE
|
||||
# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
# 22BE; RIGHT ANGLE WITH ARC
|
||||
# 22BF; RIGHT TRIANGLE
|
||||
# 22F5; ELEMENT OF WITH DOT ABOVE
|
||||
# 22F8; ELEMENT OF WITH UNDERBAR
|
||||
# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
|
||||
# 22FF; Z NOTATION BAG MEMBERSHIP
|
||||
# 2320; TOP HALF INTEGRAL
|
||||
# 2321; BOTTOM HALF INTEGRAL
|
||||
# 27C0; THREE DIMENSIONAL ANGLE
|
||||
# 27CC; LONG DIVISION
|
||||
# 27D3; LOWER RIGHT CORNER WITH DOT
|
||||
# 27D4; UPPER LEFT CORNER WITH DOT
|
||||
# 299C; RIGHT ANGLE VARIANT WITH SQUARE
|
||||
# 299D; MEASURED RIGHT ANGLE WITH DOT
|
||||
# 299E; ANGLE WITH S INSIDE
|
||||
# 299F; ACUTE ANGLE
|
||||
# 29A2; TURNED ANGLE
|
||||
# 29A6; OBLIQUE ANGLE OPENING UP
|
||||
# 29A7; OBLIQUE ANGLE OPENING DOWN
|
||||
# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
|
||||
# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
|
||||
# 29C9; TWO JOINED SQUARES
|
||||
# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
|
||||
# 29DC; INCOMPLETE INFINITY
|
||||
# 29E1; INCREASES AS
|
||||
# 29E3; EQUALS SIGN AND SLANTED PARALLEL
|
||||
# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
|
||||
# 29E5; IDENTICAL TO AND SLANTED PARALLEL
|
||||
# 29F4; RULE-DELAYED
|
||||
# 29F6; SOLIDUS WITH OVERBAR
|
||||
# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
|
||||
# 2A0A; MODULO TWO SUM
|
||||
# 2A0B; SUMMATION WITH INTEGRAL
|
||||
# 2A0C; QUADRUPLE INTEGRAL OPERATOR
|
||||
# 2A0D; FINITE PART INTEGRAL
|
||||
# 2A0E; INTEGRAL WITH DOUBLE STROKE
|
||||
# 2A0F; INTEGRAL AVERAGE WITH SLASH
|
||||
# 2A10; CIRCULATION FUNCTION
|
||||
# 2A11; ANTICLOCKWISE INTEGRATION
|
||||
# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
|
||||
# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
|
||||
# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
|
||||
# 2A15; INTEGRAL AROUND A POINT OPERATOR
|
||||
# 2A16; QUATERNION INTEGRAL OPERATOR
|
||||
# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
|
||||
# 2A18; INTEGRAL WITH TIMES SIGN
|
||||
# 2A19; INTEGRAL WITH INTERSECTION
|
||||
# 2A1A; INTEGRAL WITH UNION
|
||||
# 2A1B; INTEGRAL WITH OVERBAR
|
||||
# 2A1C; INTEGRAL WITH UNDERBAR
|
||||
# 2A1E; LARGE LEFT TRIANGLE OPERATOR
|
||||
# 2A1F; Z NOTATION SCHEMA COMPOSITION
|
||||
# 2A20; Z NOTATION SCHEMA PIPING
|
||||
# 2A21; Z NOTATION SCHEMA PROJECTION
|
||||
# 2A24; PLUS SIGN WITH TILDE ABOVE
|
||||
# 2A26; PLUS SIGN WITH TILDE BELOW
|
||||
# 2A29; MINUS SIGN WITH COMMA ABOVE
|
||||
# 2A3E; Z NOTATION RELATIONAL COMPOSITION
|
||||
# 2A57; SLOPING LARGE OR
|
||||
# 2A58; SLOPING LARGE AND
|
||||
# 2A6A; TILDE OPERATOR WITH DOT ABOVE
|
||||
# 2A6B; TILDE OPERATOR WITH RISING DOTS
|
||||
# 2A6C; SIMILAR MINUS SIMILAR
|
||||
# 2A6D; CONGRUENT WITH DOT ABOVE
|
||||
# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
|
||||
# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
|
||||
# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
|
||||
# 2A74; DOUBLE COLON EQUAL
|
||||
# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
|
||||
# 2ADC; FORKING
|
||||
# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
|
||||
# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
|
||||
# 2AF3; PARALLEL WITH TILDE OPERATOR
|
||||
# 2AFB; TRIPLE SOLIDUS BINARY RELATION
|
||||
# 2AFD; DOUBLE SOLIDUS OPERATOR
|
||||
# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
|
||||
# EOF
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,212 @@
|
|||
# PropertyAliases-14.0.0.txt
|
||||
# Date: 2021-03-08, 19:35:48 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# This file contains aliases for properties used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line has two or more fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field is the short name for the property.
|
||||
# It is typically an abbreviation, but in a number of cases it is simply
|
||||
# a duplicate of the "long name" in the second field.
|
||||
# For Unihan database tags, the short name is actually a longer string than
|
||||
# the tag specified in the second field.
|
||||
#
|
||||
# Second Field: The second field is the long name for the property,
|
||||
# typically the formal name used in documentation about the property.
|
||||
#
|
||||
# The above are the preferred aliases. Other aliases may be listed in additional fields.
|
||||
#
|
||||
# Loose matching should be applied to all property names and property values, with
|
||||
# the exception of String Property values. With loose matching of property names and
|
||||
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
|
||||
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
|
||||
#
|
||||
# NOTE: Property value names are NOT unique across properties. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Above_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# sc means the Script property, and
|
||||
# Sc means the General_Category property value Currency_Symbol (Sc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
#
|
||||
# For more information, see UAX #44, Unicode Character Database, and
|
||||
# UTS #18, Unicode Regular Expressions.
|
||||
# ================================================
|
||||
|
||||
|
||||
# ================================================
|
||||
# Numeric Properties
|
||||
# ================================================
|
||||
cjkAccountingNumeric ; kAccountingNumeric
|
||||
cjkOtherNumeric ; kOtherNumeric
|
||||
cjkPrimaryNumeric ; kPrimaryNumeric
|
||||
nv ; Numeric_Value
|
||||
|
||||
# ================================================
|
||||
# String Properties
|
||||
# ================================================
|
||||
cf ; Case_Folding
|
||||
cjkCompatibilityVariant ; kCompatibilityVariant
|
||||
dm ; Decomposition_Mapping
|
||||
FC_NFKC ; FC_NFKC_Closure
|
||||
lc ; Lowercase_Mapping
|
||||
NFKC_CF ; NFKC_Casefold
|
||||
scf ; Simple_Case_Folding ; sfc
|
||||
slc ; Simple_Lowercase_Mapping
|
||||
stc ; Simple_Titlecase_Mapping
|
||||
suc ; Simple_Uppercase_Mapping
|
||||
tc ; Titlecase_Mapping
|
||||
uc ; Uppercase_Mapping
|
||||
|
||||
# ================================================
|
||||
# Miscellaneous Properties
|
||||
# ================================================
|
||||
bmg ; Bidi_Mirroring_Glyph
|
||||
bpb ; Bidi_Paired_Bracket
|
||||
cjkIICore ; kIICore
|
||||
cjkIRG_GSource ; kIRG_GSource
|
||||
cjkIRG_HSource ; kIRG_HSource
|
||||
cjkIRG_JSource ; kIRG_JSource
|
||||
cjkIRG_KPSource ; kIRG_KPSource
|
||||
cjkIRG_KSource ; kIRG_KSource
|
||||
cjkIRG_MSource ; kIRG_MSource
|
||||
cjkIRG_SSource ; kIRG_SSource
|
||||
cjkIRG_TSource ; kIRG_TSource
|
||||
cjkIRG_UKSource ; kIRG_UKSource
|
||||
cjkIRG_USource ; kIRG_USource
|
||||
cjkIRG_VSource ; kIRG_VSource
|
||||
cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS
|
||||
EqUIdeo ; Equivalent_Unified_Ideograph
|
||||
isc ; ISO_Comment
|
||||
JSN ; Jamo_Short_Name
|
||||
na ; Name
|
||||
na1 ; Unicode_1_Name
|
||||
Name_Alias ; Name_Alias
|
||||
scx ; Script_Extensions
|
||||
|
||||
# ================================================
|
||||
# Catalog Properties
|
||||
# ================================================
|
||||
age ; Age
|
||||
blk ; Block
|
||||
sc ; Script
|
||||
|
||||
# ================================================
|
||||
# Enumerated Properties
|
||||
# ================================================
|
||||
bc ; Bidi_Class
|
||||
bpt ; Bidi_Paired_Bracket_Type
|
||||
ccc ; Canonical_Combining_Class
|
||||
dt ; Decomposition_Type
|
||||
ea ; East_Asian_Width
|
||||
gc ; General_Category
|
||||
GCB ; Grapheme_Cluster_Break
|
||||
hst ; Hangul_Syllable_Type
|
||||
InPC ; Indic_Positional_Category
|
||||
InSC ; Indic_Syllabic_Category
|
||||
jg ; Joining_Group
|
||||
jt ; Joining_Type
|
||||
lb ; Line_Break
|
||||
NFC_QC ; NFC_Quick_Check
|
||||
NFD_QC ; NFD_Quick_Check
|
||||
NFKC_QC ; NFKC_Quick_Check
|
||||
NFKD_QC ; NFKD_Quick_Check
|
||||
nt ; Numeric_Type
|
||||
SB ; Sentence_Break
|
||||
vo ; Vertical_Orientation
|
||||
WB ; Word_Break
|
||||
|
||||
# ================================================
|
||||
# Binary Properties
|
||||
# ================================================
|
||||
AHex ; ASCII_Hex_Digit
|
||||
Alpha ; Alphabetic
|
||||
Bidi_C ; Bidi_Control
|
||||
Bidi_M ; Bidi_Mirrored
|
||||
Cased ; Cased
|
||||
CE ; Composition_Exclusion
|
||||
CI ; Case_Ignorable
|
||||
Comp_Ex ; Full_Composition_Exclusion
|
||||
CWCF ; Changes_When_Casefolded
|
||||
CWCM ; Changes_When_Casemapped
|
||||
CWKCF ; Changes_When_NFKC_Casefolded
|
||||
CWL ; Changes_When_Lowercased
|
||||
CWT ; Changes_When_Titlecased
|
||||
CWU ; Changes_When_Uppercased
|
||||
Dash ; Dash
|
||||
Dep ; Deprecated
|
||||
DI ; Default_Ignorable_Code_Point
|
||||
Dia ; Diacritic
|
||||
EBase ; Emoji_Modifier_Base
|
||||
EComp ; Emoji_Component
|
||||
EMod ; Emoji_Modifier
|
||||
Emoji ; Emoji
|
||||
EPres ; Emoji_Presentation
|
||||
Ext ; Extender
|
||||
ExtPict ; Extended_Pictographic
|
||||
Gr_Base ; Grapheme_Base
|
||||
Gr_Ext ; Grapheme_Extend
|
||||
Gr_Link ; Grapheme_Link
|
||||
Hex ; Hex_Digit
|
||||
Hyphen ; Hyphen
|
||||
IDC ; ID_Continue
|
||||
Ideo ; Ideographic
|
||||
IDS ; ID_Start
|
||||
IDSB ; IDS_Binary_Operator
|
||||
IDST ; IDS_Trinary_Operator
|
||||
Join_C ; Join_Control
|
||||
LOE ; Logical_Order_Exception
|
||||
Lower ; Lowercase
|
||||
Math ; Math
|
||||
NChar ; Noncharacter_Code_Point
|
||||
OAlpha ; Other_Alphabetic
|
||||
ODI ; Other_Default_Ignorable_Code_Point
|
||||
OGr_Ext ; Other_Grapheme_Extend
|
||||
OIDC ; Other_ID_Continue
|
||||
OIDS ; Other_ID_Start
|
||||
OLower ; Other_Lowercase
|
||||
OMath ; Other_Math
|
||||
OUpper ; Other_Uppercase
|
||||
Pat_Syn ; Pattern_Syntax
|
||||
Pat_WS ; Pattern_White_Space
|
||||
PCM ; Prepended_Concatenation_Mark
|
||||
QMark ; Quotation_Mark
|
||||
Radical ; Radical
|
||||
RI ; Regional_Indicator
|
||||
SD ; Soft_Dotted
|
||||
STerm ; Sentence_Terminal
|
||||
Term ; Terminal_Punctuation
|
||||
UIdeo ; Unified_Ideograph
|
||||
Upper ; Uppercase
|
||||
VS ; Variation_Selector
|
||||
WSpace ; White_Space ; space
|
||||
XIDC ; XID_Continue
|
||||
XIDS ; XID_Start
|
||||
XO_NFC ; Expands_On_NFC
|
||||
XO_NFD ; Expands_On_NFD
|
||||
XO_NFKC ; Expands_On_NFKC
|
||||
XO_NFKD ; Expands_On_NFKD
|
||||
|
||||
# ================================================
|
||||
# Total: 129
|
||||
|
||||
# EOF
|
File diff suppressed because it is too large
Load Diff
438
maint/ucptest.c
438
maint/ucptest.c
|
@ -2,7 +2,7 @@
|
|||
* A program for testing the Unicode property table *
|
||||
***************************************************/
|
||||
|
||||
/* Copyright (c) University of Cambridge 2008-2020 */
|
||||
/* Copyright (c) University of Cambridge 2008-2022 */
|
||||
|
||||
/* Compile thus:
|
||||
|
||||
|
@ -14,10 +14,10 @@
|
|||
*/
|
||||
|
||||
/* This is a hacked-up program for testing the Unicode properties tables of
|
||||
PCRE2. It can also be used for finding characters with certain properties.
|
||||
I wrote it to help with debugging PCRE, and have added things that I found
|
||||
useful, in a rather haphazard way. The code has never been seriously tidied or
|
||||
checked for robustness, but it shouldn't now give compiler warnings.
|
||||
PCRE2. It can also be used for finding characters with certain properties. I
|
||||
wrote it to help with debugging, and have added things that I found useful, in
|
||||
a rather haphazard way. The code has never been seriously tidied or checked for
|
||||
robustness, but it shouldn't now give compiler warnings.
|
||||
|
||||
There is only one option: "-s". If given, it applies only to the "findprop"
|
||||
command. It causes the UTF-8 sequence of bytes that encode the character to be
|
||||
|
@ -33,21 +33,31 @@ return code is always zero.
|
|||
|
||||
There are three commands:
|
||||
|
||||
"findprop" must be followed by a space-separated list of Unicode code points as
|
||||
hex numbers, either without any prefix or starting with "U+". The output is one
|
||||
line per character, giving its Unicode properties followed by its other case or
|
||||
cases if one or more exist, followed by its Script Extension list if it is not
|
||||
just the same as the base script. This list is in square brackets. The
|
||||
properties are:
|
||||
The command "findprop" must be followed by a space-separated list of Unicode
|
||||
code points as hex numbers, either without any prefix or starting with "U+", or
|
||||
as individual UTF-8 characters preceded by '+'. For example:
|
||||
|
||||
General type e.g. Letter
|
||||
Specific type e.g. Upper case letter
|
||||
Script e.g. Medefaidrin
|
||||
Grapheme break type e.g. Extend (most common is Other)
|
||||
findprop U+1234 5Abc +?
|
||||
|
||||
"find" must be followed by a list of property names and their values. The
|
||||
values are case-sensitive. This finds characters that have those properties. If
|
||||
multiple properties are listed, they must all be matched. Currently supported:
|
||||
The output is one long line per character, listing Unicode properties that have
|
||||
values, followed by its other case or cases if one or more exist, followed by
|
||||
its Script Extension list if there is one. This list is in square brackets. A
|
||||
second list in square brackets gives all the Boolean properties of the
|
||||
character. The properties that come first are:
|
||||
|
||||
Bidi class e.g. NSM (most common is L)
|
||||
General type e.g. Letter
|
||||
Specific type e.g. Upper case letter
|
||||
Script e.g. Medefaidrin
|
||||
Grapheme break type e.g. Extend (most common is Other)
|
||||
|
||||
Script names and Boolean property names are all in lower case, with underscores
|
||||
and hyphens removed, because that's how they are stored for "loose" matching.
|
||||
|
||||
The command "find" must be followed by a list of property types and their
|
||||
values. The values are case-sensitive, except for bidi class. This finds
|
||||
characters that have those properties. If multiple properties are listed, they
|
||||
must all be matched. Currently supported:
|
||||
|
||||
script <name> The character must have this script property. Only one
|
||||
such script may be given.
|
||||
|
@ -56,17 +66,20 @@ multiple properties are listed, they must all be matched. Currently supported:
|
|||
scripts must be present.
|
||||
type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
|
||||
gbreak <name> The grapheme break property must match.
|
||||
bidi <class> The character's bidi class must match.
|
||||
bool <name> The character's Boolean property list must contain this
|
||||
property.
|
||||
|
||||
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
||||
Script Extensions, there may be a mixture of positive and negative
|
||||
requirements. All must be satisfied.
|
||||
Script Extensions and Boolean properties, there may be a mixture of positive
|
||||
and negative requirements. All must be satisfied.
|
||||
|
||||
Sequences of two or more characters are shown as ranges, for example
|
||||
U+0041..U+004A. No more than 100 lines are are output. If there are more
|
||||
characters, the list ends with ...
|
||||
|
||||
"list" must be followed by a property name (script, type, or gbreak). The
|
||||
defined values for that property are listed. */
|
||||
The command "list" must be followed by one of property names script, bool,
|
||||
type, gbreak or bidi. The defined values for that property are listed. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
|
@ -97,6 +110,9 @@ defined values for that property are listed. */
|
|||
#include <editline/readline.h>
|
||||
#else
|
||||
#include <readline/readline.h>
|
||||
#ifdef RL_VERSION_MAJOR
|
||||
#include <readline/history.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
@ -160,12 +176,37 @@ static const unsigned char *gb_names[] = {
|
|||
US"T", US"Hangul syllable type T",
|
||||
US"LV", US"Hangul syllable type LV",
|
||||
US"LVT", US"Hangul syllable type LVT",
|
||||
US"RegionalIndicator", US"",
|
||||
US"Regional_Indicator", US"",
|
||||
US"Other", US"",
|
||||
US"ZWJ", US"zero width joiner",
|
||||
US"Extended_Pictographic", US""
|
||||
};
|
||||
|
||||
static const unsigned char *bd_names[] = {
|
||||
US"AL", US"Arabic letter",
|
||||
US"AN", US"Arabid number",
|
||||
US"B", US"Paragraph separator",
|
||||
US"BN", US"Boundary neutral",
|
||||
US"CS", US"Common separator",
|
||||
US"EN", US"European number",
|
||||
US"ES", US"European separator",
|
||||
US"ET", US"European terminator",
|
||||
US"FSI", US"First string isolate",
|
||||
US"L", US"Left-to-right",
|
||||
US"LRE", US"Left-to-right embedding",
|
||||
US"LRI", US"Left-to-right isolate",
|
||||
US"LRO", US"Left-to-right override",
|
||||
US"NSM", US"Non-spacing mark",
|
||||
US"ON", US"Other neutral",
|
||||
US"PDF", US"Pop directional format",
|
||||
US"PDI", US"Pop directional isolate",
|
||||
US"R", US"Right-to-left",
|
||||
US"RLE", US"Right-to-left embedding",
|
||||
US"RLI", US"Right-to-left isolate",
|
||||
US"RLO", US"Right-to-left override",
|
||||
US"S", US"Segment separator",
|
||||
US"WS", US"White space"
|
||||
};
|
||||
|
||||
static const unsigned int utf8_table1[] = {
|
||||
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
||||
|
@ -173,6 +214,41 @@ static const unsigned int utf8_table1[] = {
|
|||
static const int utf8_table2[] = {
|
||||
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
|
||||
/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
|
||||
the pointer. */
|
||||
|
||||
#define GETUTF8INC(c, eptr) \
|
||||
{ \
|
||||
if ((c & 0x20u) == 0) \
|
||||
c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
|
||||
else if ((c & 0x10u) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
|
||||
eptr += 2; \
|
||||
} \
|
||||
else if ((c & 0x08u) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
|
||||
((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
|
||||
eptr += 3; \
|
||||
} \
|
||||
else if ((c & 0x04u) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
|
||||
((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
|
||||
(eptr[3] & 0x3fu); \
|
||||
eptr += 4; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
|
||||
((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
|
||||
((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
|
||||
eptr += 5; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert character value to UTF-8 *
|
||||
|
@ -224,24 +300,53 @@ return isatty(fileno(stdin));
|
|||
|
||||
|
||||
/*************************************************
|
||||
* Get script name from ucp ident *
|
||||
* Get name from ucp ident *
|
||||
*************************************************/
|
||||
|
||||
static const char *
|
||||
get_scriptname(int script)
|
||||
{
|
||||
size_t i;
|
||||
const ucp_type_table *u;
|
||||
/* The utt table contains both full names and abbreviations. So search for both
|
||||
and use the longer if two are found, unless the first one is only 3 characters
|
||||
and we are looking for a script (some scripts have 3-character names). If this
|
||||
were not just a test program it might be worth making some kind of reverse
|
||||
index. */
|
||||
|
||||
static const char *
|
||||
get_propname(int prop, int type)
|
||||
{
|
||||
size_t i, j, len;
|
||||
size_t foundlist[2];
|
||||
const char *yield;
|
||||
int typex = (type == PT_SC)? PT_SCX : type;
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
u = PRIV(utt) + i;
|
||||
if (u->type == PT_SC && u->value == script) break;
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if ((u->type == type || u->type == typex) && u->value == prop)
|
||||
{
|
||||
foundlist[j++] = i;
|
||||
if (j >= 2) break;
|
||||
}
|
||||
}
|
||||
if (i < PRIV(utt_size))
|
||||
return PRIV(utt_names) + u->name_offset;
|
||||
|
||||
return "??";
|
||||
if (j == 0) return "??";
|
||||
|
||||
yield = NULL;
|
||||
len = 0;
|
||||
|
||||
for (i = 0; i < j; i++)
|
||||
{
|
||||
const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
|
||||
size_t sl = strlen(s);
|
||||
|
||||
if (sl > len)
|
||||
{
|
||||
yield = s;
|
||||
if (sl == 3 && type == PT_SC) break;
|
||||
len = sl;
|
||||
}
|
||||
}
|
||||
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
|
@ -257,13 +362,16 @@ int fulltype = UCD_CHARTYPE(c);
|
|||
int script = UCD_SCRIPT(c);
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
int gbprop = UCD_GRAPHBREAK(c);
|
||||
int bidi = UCD_BIDICLASS(c);
|
||||
unsigned int othercase = UCD_OTHERCASE(c);
|
||||
int caseset = UCD_CASESET(c);
|
||||
int bprops = UCD_BPROPS(c);
|
||||
|
||||
const unsigned char *fulltypename = US"??";
|
||||
const unsigned char *typename = US"??";
|
||||
const unsigned char *graphbreak = US"??";
|
||||
const unsigned char *scriptname = CUS get_scriptname(script);
|
||||
const unsigned char *bidiclass = US"??";
|
||||
const unsigned char *scriptname = CUS get_propname(script, PT_SC);
|
||||
|
||||
switch (type)
|
||||
{
|
||||
|
@ -323,7 +431,7 @@ switch(gbprop)
|
|||
case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
|
||||
case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
|
||||
case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
|
||||
case ucp_gbRegionalIndicator:
|
||||
case ucp_gbRegional_Indicator:
|
||||
graphbreak = US"Regional Indicator"; break;
|
||||
case ucp_gbOther: graphbreak = US"Other"; break;
|
||||
case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
|
||||
|
@ -332,7 +440,37 @@ switch(gbprop)
|
|||
default: graphbreak = US"Unknown"; break;
|
||||
}
|
||||
|
||||
printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
||||
switch(bidi)
|
||||
{
|
||||
case ucp_bidiAL: bidiclass = US"AL "; break;
|
||||
case ucp_bidiFSI: bidiclass = US"FSI"; break;
|
||||
case ucp_bidiL: bidiclass = US"L "; break;
|
||||
case ucp_bidiLRE: bidiclass = US"LRE"; break;
|
||||
case ucp_bidiLRI: bidiclass = US"LRI"; break;
|
||||
case ucp_bidiLRO: bidiclass = US"LRO"; break;
|
||||
case ucp_bidiPDF: bidiclass = US"PDF"; break;
|
||||
case ucp_bidiPDI: bidiclass = US"PDI"; break;
|
||||
case ucp_bidiR: bidiclass = US"R "; break;
|
||||
case ucp_bidiRLE: bidiclass = US"RLE"; break;
|
||||
case ucp_bidiRLI: bidiclass = US"RLI"; break;
|
||||
case ucp_bidiRLO: bidiclass = US"RLO"; break;
|
||||
case ucp_bidiAN: bidiclass = US"AN "; break;
|
||||
case ucp_bidiB: bidiclass = US"B "; break;
|
||||
case ucp_bidiBN: bidiclass = US"BN "; break;
|
||||
case ucp_bidiCS: bidiclass = US"CS "; break;
|
||||
case ucp_bidiEN: bidiclass = US"EN "; break;
|
||||
case ucp_bidiES: bidiclass = US"ES "; break;
|
||||
case ucp_bidiET: bidiclass = US"ET "; break;
|
||||
case ucp_bidiNSM: bidiclass = US"NSM"; break;
|
||||
case ucp_bidiON: bidiclass = US"ON "; break;
|
||||
case ucp_bidiS: bidiclass = US"S "; break;
|
||||
case ucp_bidiWS: bidiclass = US"WS "; break;
|
||||
default: bidiclass = US"???"; break;
|
||||
}
|
||||
|
||||
printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
|
||||
scriptname, graphbreak);
|
||||
|
||||
if (is_just_one && othercase != c)
|
||||
{
|
||||
printf(", U+%04X", othercase);
|
||||
|
@ -347,20 +485,31 @@ if (is_just_one && othercase != c)
|
|||
}
|
||||
}
|
||||
|
||||
if (scriptx != script)
|
||||
{
|
||||
printf(", [");
|
||||
if (scriptx >= 0)
|
||||
printf("%s", get_scriptname(scriptx));
|
||||
else
|
||||
if (scriptx != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||
while (*p != 0)
|
||||
const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
|
||||
printf(", [");
|
||||
for (int i = 0; i < ucp_Unknown; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_scriptname(*p++));
|
||||
printf("%s%s", sep, get_propname(i, PT_SC));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
if (bprops != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint32_t *p = PRIV(ucd_boolprop_sets) +
|
||||
bprops * ucd_boolprop_sets_item_size;
|
||||
printf(", [");
|
||||
for (int i = 0; i < ucp_Bprop_Count; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_propname(i, PT_BOOL));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
@ -384,19 +533,23 @@ printf("\n");
|
|||
static void
|
||||
find_chars(unsigned char *s)
|
||||
{
|
||||
unsigned char name[24];
|
||||
unsigned char value[24];
|
||||
unsigned char name[128];
|
||||
unsigned char value[128];
|
||||
unsigned char *t;
|
||||
unsigned int count= 0;
|
||||
int scriptx_list[24];
|
||||
int scriptx_list[128];
|
||||
unsigned int scriptx_count = 0;
|
||||
int bprop_list[128];
|
||||
unsigned int bprop_count = 0;
|
||||
uint32_t i, c;
|
||||
int script = -1;
|
||||
int type = -1;
|
||||
int gbreak = -1;
|
||||
int bidiclass = -1;
|
||||
BOOL script_not = FALSE;
|
||||
BOOL type_not = FALSE;
|
||||
BOOL gbreak_not = FALSE;
|
||||
BOOL bidiclass_not = FALSE;
|
||||
BOOL hadrange = FALSE;
|
||||
const ucd_record *ucd, *next_ucd;
|
||||
const char *pad = " ";
|
||||
|
@ -410,13 +563,18 @@ while (*s != 0)
|
|||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
for (t = value; *s != 0 && !isspace(*s); s++)
|
||||
{
|
||||
if (*s != '_' && *s != '-') *t++ = *s;
|
||||
}
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "script") == 0 ||
|
||||
strcmp(CS name, "scriptx") == 0)
|
||||
{
|
||||
for (t = value; *t != 0; t++) *t = tolower(*t);
|
||||
|
||||
if (value[0] == '!')
|
||||
{
|
||||
if (name[6] == 'x') scriptx_not = TRUE;
|
||||
|
@ -427,7 +585,7 @@ while (*s != 0)
|
|||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_SC && strcmp(CS(value + offset),
|
||||
if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
|
||||
PRIV(utt_names) + u->name_offset) == 0)
|
||||
{
|
||||
c = u->value;
|
||||
|
@ -454,6 +612,33 @@ while (*s != 0)
|
|||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bool") == 0)
|
||||
{
|
||||
int not = 1;
|
||||
if (value[0] == '!')
|
||||
{
|
||||
not = -1;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_BOOL && strcmp(CS(value + offset),
|
||||
PRIV(utt_names) + u->name_offset) == 0)
|
||||
{
|
||||
bprop_list[bprop_count++] = u->value * not;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i >= PRIV(utt_size))
|
||||
{
|
||||
printf("** Unrecognized property name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "type") == 0)
|
||||
{
|
||||
if (type >= 0)
|
||||
|
@ -516,6 +701,38 @@ while (*s != 0)
|
|||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bidi") == 0 ||
|
||||
strcmp(CS name, "bidiclass") == 0 ||
|
||||
strcmp(CS name, "bidi_class") == 0 )
|
||||
{
|
||||
if (bidiclass >= 0)
|
||||
{
|
||||
printf("** Only 1 bidi class value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
bidiclass_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
|
||||
{
|
||||
if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
|
||||
{
|
||||
bidiclass = i/2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(bd_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized bidi class name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unrecognized property name \"%s\"\n", name);
|
||||
|
@ -523,7 +740,8 @@ while (*s != 0)
|
|||
}
|
||||
}
|
||||
|
||||
if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
|
||||
if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
|
||||
gbreak < 0 && bidiclass < 0)
|
||||
{
|
||||
printf("** No properties specified\n");
|
||||
return;
|
||||
|
@ -535,55 +753,55 @@ for (c = 0; c <= 0x10ffff; c++)
|
|||
|
||||
if (scriptx_count > 0)
|
||||
{
|
||||
const uint8_t *char_scriptx = NULL;
|
||||
const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
|
||||
unsigned int found = 0;
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
|
||||
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
|
||||
|
||||
for (i = 0; i < scriptx_count; i++)
|
||||
{
|
||||
int x = scriptx_list[i]/32;
|
||||
int y = scriptx_list[i]%32;
|
||||
|
||||
/* Positive requirment */
|
||||
if (scriptx_list[i] >= 0)
|
||||
{
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
if (scriptx == scriptx_list[i]) found++;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
const uint8_t *p;
|
||||
for (p = char_scriptx; *p != 0; p++)
|
||||
{
|
||||
if (scriptx_list[i] == *p)
|
||||
{
|
||||
found++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
if (scriptx != -scriptx_list[i]) found++;
|
||||
}
|
||||
else
|
||||
{
|
||||
const uint8_t *p;
|
||||
for (p = char_scriptx; *p != 0; p++)
|
||||
if (-scriptx_list[i] == *p) break;
|
||||
if (*p == 0) found++;
|
||||
}
|
||||
if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (found != scriptx_count) continue;
|
||||
}
|
||||
|
||||
if (bprop_count > 0)
|
||||
{
|
||||
const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
|
||||
unsigned int found = 0;
|
||||
|
||||
for (i = 0; i < bprop_count; i++)
|
||||
{
|
||||
int x = bprop_list[i]/32;
|
||||
int y = bprop_list[i]%32;
|
||||
|
||||
/* Positive requirement */
|
||||
if (bprop_list[i] >= 0)
|
||||
{
|
||||
if ((bits_bprop[x] & (1u<<y)) != 0) found++;
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (found != bprop_count) continue;
|
||||
}
|
||||
|
||||
if (type >= 0)
|
||||
{
|
||||
if (type_not)
|
||||
|
@ -608,6 +826,18 @@ for (c = 0; c <= 0x10ffff; c++)
|
|||
}
|
||||
}
|
||||
|
||||
if (bidiclass >= 0)
|
||||
{
|
||||
if (bidiclass_not)
|
||||
{
|
||||
if (bidiclass == UCD_BIDICLASS(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (bidiclass != UCD_BIDICLASS(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* All conditions are met. Look for runs. */
|
||||
|
||||
ucd = GET_UCD(c);
|
||||
|
@ -666,12 +896,26 @@ if (strcmp(CS name, "findprop") == 0)
|
|||
unsigned int c;
|
||||
unsigned char *endptr;
|
||||
t = s;
|
||||
|
||||
if (*t == '+')
|
||||
{
|
||||
c = *(++t);
|
||||
if (c > 0x7fu)
|
||||
{
|
||||
GETCHARINC(c, t);
|
||||
}
|
||||
endptr = t+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (strncmp(CS t, "U+", 2) == 0) t += 2;
|
||||
c = strtoul(CS t, CSS(&endptr), 16);
|
||||
}
|
||||
|
||||
if (*endptr != 0 && !isspace(*endptr))
|
||||
{
|
||||
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
||||
printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s);
|
||||
printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -702,7 +946,14 @@ else if (strcmp(CS name, "list") == 0)
|
|||
if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
|
||||
{
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_SC)
|
||||
if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bool") == 0)
|
||||
{
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_BOOL)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
}
|
||||
|
||||
|
@ -723,6 +974,13 @@ else if (strcmp(CS name, "list") == 0)
|
|||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bidi") == 0 ||
|
||||
strcmp(CS name, "bidiclasses") == 0)
|
||||
{
|
||||
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
|
||||
printf("%3s %s\n", bd_names[i], bd_names[i+1]);
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unknown property \"%s\"\n", name);
|
||||
|
@ -756,19 +1014,19 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0)
|
|||
if (argc > first_arg)
|
||||
{
|
||||
int i;
|
||||
BOOL hexfirst = TRUE;
|
||||
BOOL datafirst = TRUE;
|
||||
char *arg = argv[first_arg];
|
||||
unsigned char *s = buffer;
|
||||
|
||||
if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
|
||||
if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
|
||||
{
|
||||
while (*arg != 0)
|
||||
{
|
||||
if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }
|
||||
if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
|
||||
}
|
||||
}
|
||||
|
||||
if (hexfirst)
|
||||
if (datafirst)
|
||||
{
|
||||
strcpy(CS s, "findprop ");
|
||||
s += 9;
|
||||
|
|
|
@ -46,3 +46,5 @@ findprop 32ff
|
|||
findprop 1f16d
|
||||
|
||||
findprop U+10e93 U+10eaa
|
||||
|
||||
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
|
||||
|
|
|
@ -4,3 +4,16 @@ find type Sk
|
|||
find type Pd
|
||||
find gbreak LVT
|
||||
find script Old_Uyghur
|
||||
find bidi PDF
|
||||
find bidi CS
|
||||
find bidi CS type Sm
|
||||
find bidi B
|
||||
find bidi FSI
|
||||
find bidi PDI
|
||||
find bidi RLI
|
||||
find bidi RLO
|
||||
find bidi S
|
||||
find bidi WS
|
||||
find script bopo
|
||||
find bool prependedconcatenationmark
|
||||
find bool pcm
|
||||
|
|
|
@ -1,398 +1,409 @@
|
|||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||
U+0000 Control: Control, Common, Control
|
||||
U+0001 Control: Control, Common, Control
|
||||
U+0002 Control: Control, Common, Control
|
||||
U+0003 Control: Control, Common, Control
|
||||
U+0004 Control: Control, Common, Control
|
||||
U+0005 Control: Control, Common, Control
|
||||
U+0006 Control: Control, Common, Control
|
||||
U+0007 Control: Control, Common, Control
|
||||
U+0008 Control: Control, Common, Control
|
||||
U+0009 Control: Control, Common, Control
|
||||
U+000A Control: Control, Common, LF
|
||||
U+000B Control: Control, Common, Control
|
||||
U+000C Control: Control, Common, Control
|
||||
U+000D Control: Control, Common, CR
|
||||
U+000E Control: Control, Common, Control
|
||||
U+000F Control: Control, Common, Control
|
||||
U+0000 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0001 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0002 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0003 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0004 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0005 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0006 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0007 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0008 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+000F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||
U+0010 Control: Control, Common, Control
|
||||
U+0011 Control: Control, Common, Control
|
||||
U+0012 Control: Control, Common, Control
|
||||
U+0013 Control: Control, Common, Control
|
||||
U+0014 Control: Control, Common, Control
|
||||
U+0015 Control: Control, Common, Control
|
||||
U+0016 Control: Control, Common, Control
|
||||
U+0017 Control: Control, Common, Control
|
||||
U+0018 Control: Control, Common, Control
|
||||
U+0019 Control: Control, Common, Control
|
||||
U+001A Control: Control, Common, Control
|
||||
U+001B Control: Control, Common, Control
|
||||
U+001C Control: Control, Common, Control
|
||||
U+001D Control: Control, Common, Control
|
||||
U+001E Control: Control, Common, Control
|
||||
U+001F Control: Control, Common, Control
|
||||
U+0010 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0011 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0012 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0013 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0014 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0015 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0016 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0017 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0018 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0019 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001A BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001B BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001C B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001D B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||
U+0020 Separator: Space separator, Common, Other
|
||||
U+0021 Punctuation: Other punctuation, Common, Other
|
||||
U+0022 Punctuation: Other punctuation, Common, Other
|
||||
U+0023 Punctuation: Other punctuation, Common, Other
|
||||
U+0024 Symbol: Currency symbol, Common, Other
|
||||
U+0025 Punctuation: Other punctuation, Common, Other
|
||||
U+0026 Punctuation: Other punctuation, Common, Other
|
||||
U+0027 Punctuation: Other punctuation, Common, Other
|
||||
U+0028 Punctuation: Open punctuation, Common, Other
|
||||
U+0029 Punctuation: Close punctuation, Common, Other
|
||||
U+002A Punctuation: Other punctuation, Common, Other
|
||||
U+002B Symbol: Mathematical symbol, Common, Other
|
||||
U+002C Punctuation: Other punctuation, Common, Other
|
||||
U+002D Punctuation: Dash punctuation, Common, Other
|
||||
U+002E Punctuation: Other punctuation, Common, Other
|
||||
U+002F Punctuation: Other punctuation, Common, Other
|
||||
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
|
||||
U+0021 ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
U+0022 ON Punctuation: Other punctuation, common, Other, [ascii, graphemebase, math, patternsyntax]
|
||||
U+0023 ET Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
|
||||
U+0024 ET Symbol: Currency symbol, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0025 ET Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0026 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0027 ON Punctuation: Other punctuation, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
|
||||
U+0028 ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+0029 ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+002A ON Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
|
||||
U+002B ES Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
|
||||
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||
U+0030 Number: Decimal number, Common, Other
|
||||
U+0031 Number: Decimal number, Common, Other
|
||||
U+0032 Number: Decimal number, Common, Other
|
||||
U+0033 Number: Decimal number, Common, Other
|
||||
U+0034 Number: Decimal number, Common, Other
|
||||
U+0035 Number: Decimal number, Common, Other
|
||||
U+0036 Number: Decimal number, Common, Other
|
||||
U+0037 Number: Decimal number, Common, Other
|
||||
U+0038 Number: Decimal number, Common, Other
|
||||
U+0039 Number: Decimal number, Common, Other
|
||||
U+003A Punctuation: Other punctuation, Common, Other
|
||||
U+003B Punctuation: Other punctuation, Common, Other
|
||||
U+003C Symbol: Mathematical symbol, Common, Other
|
||||
U+003D Symbol: Mathematical symbol, Common, Other
|
||||
U+003E Symbol: Mathematical symbol, Common, Other
|
||||
U+003F Punctuation: Other punctuation, Common, Other
|
||||
U+0030 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0031 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0032 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0033 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0034 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0035 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0036 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0037 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0038 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0039 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+003B ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+003C ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
|
||||
U+003D ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+003E ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
|
||||
U+003F ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||
U+0040 Punctuation: Other punctuation, Common, Other
|
||||
U+0041 Letter: Upper case letter, Latin, Other, U+0061
|
||||
U+0042 Letter: Upper case letter, Latin, Other, U+0062
|
||||
U+0043 Letter: Upper case letter, Latin, Other, U+0063
|
||||
U+0044 Letter: Upper case letter, Latin, Other, U+0064
|
||||
U+0045 Letter: Upper case letter, Latin, Other, U+0065
|
||||
U+0046 Letter: Upper case letter, Latin, Other, U+0066
|
||||
U+0047 Letter: Upper case letter, Latin, Other, U+0067
|
||||
U+0048 Letter: Upper case letter, Latin, Other, U+0068
|
||||
U+0049 Letter: Upper case letter, Latin, Other, U+0069
|
||||
U+004A Letter: Upper case letter, Latin, Other, U+006A
|
||||
U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A
|
||||
U+004C Letter: Upper case letter, Latin, Other, U+006C
|
||||
U+004D Letter: Upper case letter, Latin, Other, U+006D
|
||||
U+004E Letter: Upper case letter, Latin, Other, U+006E
|
||||
U+004F Letter: Upper case letter, Latin, Other, U+006F
|
||||
U+0040 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0041 L Letter: Upper case letter, latin, Other, U+0061, [graphemebase]
|
||||
U+0042 L Letter: Upper case letter, latin, Other, U+0062, [graphemebase]
|
||||
U+0043 L Letter: Upper case letter, latin, Other, U+0063, [graphemebase]
|
||||
U+0044 L Letter: Upper case letter, latin, Other, U+0064, [graphemebase]
|
||||
U+0045 L Letter: Upper case letter, latin, Other, U+0065, [graphemebase]
|
||||
U+0046 L Letter: Upper case letter, latin, Other, U+0066, [graphemebase]
|
||||
U+0047 L Letter: Upper case letter, latin, Other, U+0067, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0048 L Letter: Upper case letter, latin, Other, U+0068, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0049 L Letter: Upper case letter, latin, Other, U+0069, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004A L Letter: Upper case letter, latin, Other, U+006A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004B L Letter: Upper case letter, latin, Other, U+006B, U+212A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004C L Letter: Upper case letter, latin, Other, U+006C, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004D L Letter: Upper case letter, latin, Other, U+006D, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004E L Letter: Upper case letter, latin, Other, U+006E, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004F L Letter: Upper case letter, latin, Other, U+006F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||
U+0050 Letter: Upper case letter, Latin, Other, U+0070
|
||||
U+0051 Letter: Upper case letter, Latin, Other, U+0071
|
||||
U+0052 Letter: Upper case letter, Latin, Other, U+0072
|
||||
U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F
|
||||
U+0054 Letter: Upper case letter, Latin, Other, U+0074
|
||||
U+0055 Letter: Upper case letter, Latin, Other, U+0075
|
||||
U+0056 Letter: Upper case letter, Latin, Other, U+0076
|
||||
U+0057 Letter: Upper case letter, Latin, Other, U+0077
|
||||
U+0058 Letter: Upper case letter, Latin, Other, U+0078
|
||||
U+0059 Letter: Upper case letter, Latin, Other, U+0079
|
||||
U+005A Letter: Upper case letter, Latin, Other, U+007A
|
||||
U+005B Punctuation: Open punctuation, Common, Other
|
||||
U+005C Punctuation: Other punctuation, Common, Other
|
||||
U+005D Punctuation: Close punctuation, Common, Other
|
||||
U+005E Symbol: Modifier symbol, Common, Other
|
||||
U+005F Punctuation: Connector punctuation, Common, Other
|
||||
U+0050 L Letter: Upper case letter, latin, Other, U+0070, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0051 L Letter: Upper case letter, latin, Other, U+0071, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0052 L Letter: Upper case letter, latin, Other, U+0072, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0053 L Letter: Upper case letter, latin, Other, U+0073, U+017F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0054 L Letter: Upper case letter, latin, Other, U+0074, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0055 L Letter: Upper case letter, latin, Other, U+0075, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0056 L Letter: Upper case letter, latin, Other, U+0076, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0057 L Letter: Upper case letter, latin, Other, U+0077, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0058 L Letter: Upper case letter, latin, Other, U+0078, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0059 L Letter: Upper case letter, latin, Other, U+0079, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+005A L Letter: Upper case letter, latin, Other, U+007A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+005B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+005C ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+005D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+005F ON Punctuation: Connector punctuation, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, deprecated, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||
U+0060 Symbol: Modifier symbol, Common, Other
|
||||
U+0061 Letter: Lower case letter, Latin, Other, U+0041
|
||||
U+0062 Letter: Lower case letter, Latin, Other, U+0042
|
||||
U+0063 Letter: Lower case letter, Latin, Other, U+0043
|
||||
U+0064 Letter: Lower case letter, Latin, Other, U+0044
|
||||
U+0065 Letter: Lower case letter, Latin, Other, U+0045
|
||||
U+0066 Letter: Lower case letter, Latin, Other, U+0046
|
||||
U+0067 Letter: Lower case letter, Latin, Other, U+0047
|
||||
U+0068 Letter: Lower case letter, Latin, Other, U+0048
|
||||
U+0069 Letter: Lower case letter, Latin, Other, U+0049
|
||||
U+006A Letter: Lower case letter, Latin, Other, U+004A
|
||||
U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A
|
||||
U+006C Letter: Lower case letter, Latin, Other, U+004C
|
||||
U+006D Letter: Lower case letter, Latin, Other, U+004D
|
||||
U+006E Letter: Lower case letter, Latin, Other, U+004E
|
||||
U+006F Letter: Lower case letter, Latin, Other, U+004F
|
||||
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+0061 L Letter: Lower case letter, latin, Other, U+0041, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0062 L Letter: Lower case letter, latin, Other, U+0042, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0063 L Letter: Lower case letter, latin, Other, U+0043, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0064 L Letter: Lower case letter, latin, Other, U+0044, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0065 L Letter: Lower case letter, latin, Other, U+0045, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0066 L Letter: Lower case letter, latin, Other, U+0046, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0067 L Letter: Lower case letter, latin, Other, U+0047, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0068 L Letter: Lower case letter, latin, Other, U+0048, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0069 L Letter: Lower case letter, latin, Other, U+0049, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+006A L Letter: Lower case letter, latin, Other, U+004A, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+006B L Letter: Lower case letter, latin, Other, U+004B, U+212A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006C L Letter: Lower case letter, latin, Other, U+004C, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006D L Letter: Lower case letter, latin, Other, U+004D, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006E L Letter: Lower case letter, latin, Other, U+004E, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006F L Letter: Lower case letter, latin, Other, U+004F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||
U+0070 Letter: Lower case letter, Latin, Other, U+0050
|
||||
U+0071 Letter: Lower case letter, Latin, Other, U+0051
|
||||
U+0072 Letter: Lower case letter, Latin, Other, U+0052
|
||||
U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F
|
||||
U+0074 Letter: Lower case letter, Latin, Other, U+0054
|
||||
U+0075 Letter: Lower case letter, Latin, Other, U+0055
|
||||
U+0076 Letter: Lower case letter, Latin, Other, U+0056
|
||||
U+0077 Letter: Lower case letter, Latin, Other, U+0057
|
||||
U+0078 Letter: Lower case letter, Latin, Other, U+0058
|
||||
U+0079 Letter: Lower case letter, Latin, Other, U+0059
|
||||
U+007A Letter: Lower case letter, Latin, Other, U+005A
|
||||
U+007B Punctuation: Open punctuation, Common, Other
|
||||
U+007C Symbol: Mathematical symbol, Common, Other
|
||||
U+007D Punctuation: Close punctuation, Common, Other
|
||||
U+007E Symbol: Mathematical symbol, Common, Other
|
||||
U+007F Control: Control, Common, Control
|
||||
U+0070 L Letter: Lower case letter, latin, Other, U+0050, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0071 L Letter: Lower case letter, latin, Other, U+0051, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0072 L Letter: Lower case letter, latin, Other, U+0052, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0073 L Letter: Lower case letter, latin, Other, U+0053, U+017F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0074 L Letter: Lower case letter, latin, Other, U+0054, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0075 L Letter: Lower case letter, latin, Other, U+0055, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0076 L Letter: Lower case letter, latin, Other, U+0056, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0077 L Letter: Lower case letter, latin, Other, U+0057, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0078 L Letter: Lower case letter, latin, Other, U+0058, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0079 L Letter: Lower case letter, latin, Other, U+0059, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+007A L Letter: Lower case letter, latin, Other, U+005A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+007B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+007C ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+007D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+007E ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+007F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
|
||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||
U+0080 Control: Control, Common, Control
|
||||
U+0081 Control: Control, Common, Control
|
||||
U+0082 Control: Control, Common, Control
|
||||
U+0083 Control: Control, Common, Control
|
||||
U+0084 Control: Control, Common, Control
|
||||
U+0085 Control: Control, Common, Control
|
||||
U+0086 Control: Control, Common, Control
|
||||
U+0087 Control: Control, Common, Control
|
||||
U+0088 Control: Control, Common, Control
|
||||
U+0089 Control: Control, Common, Control
|
||||
U+008A Control: Control, Common, Control
|
||||
U+008B Control: Control, Common, Control
|
||||
U+008C Control: Control, Common, Control
|
||||
U+008D Control: Control, Common, Control
|
||||
U+008E Control: Control, Common, Control
|
||||
U+008F Control: Control, Common, Control
|
||||
U+0080 BN Control: Control, common, Control
|
||||
U+0081 BN Control: Control, common, Control
|
||||
U+0082 BN Control: Control, common, Control
|
||||
U+0083 BN Control: Control, common, Control
|
||||
U+0084 BN Control: Control, common, Control
|
||||
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0086 BN Control: Control, common, Control
|
||||
U+0087 BN Control: Control, common, Control
|
||||
U+0088 BN Control: Control, common, Control
|
||||
U+0089 BN Control: Control, common, Control
|
||||
U+008A BN Control: Control, common, Control
|
||||
U+008B BN Control: Control, common, Control
|
||||
U+008C BN Control: Control, common, Control
|
||||
U+008D BN Control: Control, common, Control
|
||||
U+008E BN Control: Control, common, Control
|
||||
U+008F BN Control: Control, common, Control
|
||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||
U+0090 Control: Control, Common, Control
|
||||
U+0091 Control: Control, Common, Control
|
||||
U+0092 Control: Control, Common, Control
|
||||
U+0093 Control: Control, Common, Control
|
||||
U+0094 Control: Control, Common, Control
|
||||
U+0095 Control: Control, Common, Control
|
||||
U+0096 Control: Control, Common, Control
|
||||
U+0097 Control: Control, Common, Control
|
||||
U+0098 Control: Control, Common, Control
|
||||
U+0099 Control: Control, Common, Control
|
||||
U+009A Control: Control, Common, Control
|
||||
U+009B Control: Control, Common, Control
|
||||
U+009C Control: Control, Common, Control
|
||||
U+009D Control: Control, Common, Control
|
||||
U+009E Control: Control, Common, Control
|
||||
U+009F Control: Control, Common, Control
|
||||
U+0090 BN Control: Control, common, Control
|
||||
U+0091 BN Control: Control, common, Control
|
||||
U+0092 BN Control: Control, common, Control
|
||||
U+0093 BN Control: Control, common, Control
|
||||
U+0094 BN Control: Control, common, Control
|
||||
U+0095 BN Control: Control, common, Control
|
||||
U+0096 BN Control: Control, common, Control
|
||||
U+0097 BN Control: Control, common, Control
|
||||
U+0098 BN Control: Control, common, Control
|
||||
U+0099 BN Control: Control, common, Control
|
||||
U+009A BN Control: Control, common, Control
|
||||
U+009B BN Control: Control, common, Control
|
||||
U+009C BN Control: Control, common, Control
|
||||
U+009D BN Control: Control, common, Control
|
||||
U+009E BN Control: Control, common, Control
|
||||
U+009F BN Control: Control, common, Control
|
||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||
U+00A0 Separator: Space separator, Common, Other
|
||||
U+00A1 Punctuation: Other punctuation, Common, Other
|
||||
U+00A2 Symbol: Currency symbol, Common, Other
|
||||
U+00A3 Symbol: Currency symbol, Common, Other
|
||||
U+00A4 Symbol: Currency symbol, Common, Other
|
||||
U+00A5 Symbol: Currency symbol, Common, Other
|
||||
U+00A6 Symbol: Other symbol, Common, Other
|
||||
U+00A7 Punctuation: Other punctuation, Common, Other
|
||||
U+00A8 Symbol: Modifier symbol, Common, Other
|
||||
U+00A9 Symbol: Other symbol, Common, Extended Pictographic
|
||||
U+00AA Letter: Other letter, Latin, Other
|
||||
U+00AB Punctuation: Initial punctuation, Common, Other
|
||||
U+00AC Symbol: Mathematical symbol, Common, Other
|
||||
U+00AD Control: Format, Common, Control
|
||||
U+00AE Symbol: Other symbol, Common, Extended Pictographic
|
||||
U+00AF Symbol: Modifier symbol, Common, Other
|
||||
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+00A1 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A2 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A3 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A4 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A5 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A6 ON Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A7 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00A9 ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
|
||||
U+00AB ON Punctuation: Initial punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+00AC ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+00AE ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||
U+00B0 Symbol: Other symbol, Common, Other
|
||||
U+00B1 Symbol: Mathematical symbol, Common, Other
|
||||
U+00B2 Number: Other number, Common, Other
|
||||
U+00B3 Number: Other number, Common, Other
|
||||
U+00B4 Symbol: Modifier symbol, Common, Other
|
||||
U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C
|
||||
U+00B6 Punctuation: Other punctuation, Common, Other
|
||||
U+00B7 Punctuation: Other punctuation, Common, Other
|
||||
U+00B8 Symbol: Modifier symbol, Common, Other
|
||||
U+00B9 Number: Other number, Common, Other
|
||||
U+00BA Letter: Other letter, Latin, Other
|
||||
U+00BB Punctuation: Final punctuation, Common, Other
|
||||
U+00BC Number: Other number, Common, Other
|
||||
U+00BD Number: Other number, Common, Other
|
||||
U+00BE Number: Other number, Common, Other
|
||||
U+00BF Punctuation: Other punctuation, Common, Other
|
||||
U+00B0 ET Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00B1 ET Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00B2 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B3 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B5 L Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B6 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00B7 ON Punctuation: Other punctuation, common, Other, [alphabetic, graphemebase, idcontinue, xidcontinue]
|
||||
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B9 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
|
||||
U+00BB ON Punctuation: Final punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+00BC ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BD ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BE ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BF ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||
U+00C0 Letter: Upper case letter, Latin, Other, U+00E0
|
||||
U+00C1 Letter: Upper case letter, Latin, Other, U+00E1
|
||||
U+00C2 Letter: Upper case letter, Latin, Other, U+00E2
|
||||
U+00C3 Letter: Upper case letter, Latin, Other, U+00E3
|
||||
U+00C4 Letter: Upper case letter, Latin, Other, U+00E4
|
||||
U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B
|
||||
U+00C6 Letter: Upper case letter, Latin, Other, U+00E6
|
||||
U+00C7 Letter: Upper case letter, Latin, Other, U+00E7
|
||||
U+00C8 Letter: Upper case letter, Latin, Other, U+00E8
|
||||
U+00C9 Letter: Upper case letter, Latin, Other, U+00E9
|
||||
U+00CA Letter: Upper case letter, Latin, Other, U+00EA
|
||||
U+00CB Letter: Upper case letter, Latin, Other, U+00EB
|
||||
U+00CC Letter: Upper case letter, Latin, Other, U+00EC
|
||||
U+00CD Letter: Upper case letter, Latin, Other, U+00ED
|
||||
U+00CE Letter: Upper case letter, Latin, Other, U+00EE
|
||||
U+00CF Letter: Upper case letter, Latin, Other, U+00EF
|
||||
U+00C0 L Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C1 L Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C2 L Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C3 L Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C4 L Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C5 L Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C6 L Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C7 L Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C8 L Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C9 L Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CA L Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CB L Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CC L Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CD L Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CE L Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CF L Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||
U+00D0 Letter: Upper case letter, Latin, Other, U+00F0
|
||||
U+00D1 Letter: Upper case letter, Latin, Other, U+00F1
|
||||
U+00D2 Letter: Upper case letter, Latin, Other, U+00F2
|
||||
U+00D3 Letter: Upper case letter, Latin, Other, U+00F3
|
||||
U+00D4 Letter: Upper case letter, Latin, Other, U+00F4
|
||||
U+00D5 Letter: Upper case letter, Latin, Other, U+00F5
|
||||
U+00D6 Letter: Upper case letter, Latin, Other, U+00F6
|
||||
U+00D7 Symbol: Mathematical symbol, Common, Other
|
||||
U+00D8 Letter: Upper case letter, Latin, Other, U+00F8
|
||||
U+00D9 Letter: Upper case letter, Latin, Other, U+00F9
|
||||
U+00DA Letter: Upper case letter, Latin, Other, U+00FA
|
||||
U+00DB Letter: Upper case letter, Latin, Other, U+00FB
|
||||
U+00DC Letter: Upper case letter, Latin, Other, U+00FC
|
||||
U+00DD Letter: Upper case letter, Latin, Other, U+00FD
|
||||
U+00DE Letter: Upper case letter, Latin, Other, U+00FE
|
||||
U+00DF Letter: Lower case letter, Latin, Other, U+1E9E
|
||||
U+00D0 L Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D1 L Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D2 L Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D3 L Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D4 L Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D5 L Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D6 L Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D8 L Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D9 L Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DA L Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DB L Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DC L Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DD L Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DE L Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DF L Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||
U+00E0 Letter: Lower case letter, Latin, Other, U+00C0
|
||||
U+00E1 Letter: Lower case letter, Latin, Other, U+00C1
|
||||
U+00E2 Letter: Lower case letter, Latin, Other, U+00C2
|
||||
U+00E3 Letter: Lower case letter, Latin, Other, U+00C3
|
||||
U+00E4 Letter: Lower case letter, Latin, Other, U+00C4
|
||||
U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B
|
||||
U+00E6 Letter: Lower case letter, Latin, Other, U+00C6
|
||||
U+00E7 Letter: Lower case letter, Latin, Other, U+00C7
|
||||
U+00E8 Letter: Lower case letter, Latin, Other, U+00C8
|
||||
U+00E9 Letter: Lower case letter, Latin, Other, U+00C9
|
||||
U+00EA Letter: Lower case letter, Latin, Other, U+00CA
|
||||
U+00EB Letter: Lower case letter, Latin, Other, U+00CB
|
||||
U+00EC Letter: Lower case letter, Latin, Other, U+00CC
|
||||
U+00ED Letter: Lower case letter, Latin, Other, U+00CD
|
||||
U+00EE Letter: Lower case letter, Latin, Other, U+00CE
|
||||
U+00EF Letter: Lower case letter, Latin, Other, U+00CF
|
||||
U+00E0 L Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E1 L Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E2 L Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E3 L Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E4 L Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E5 L Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E6 L Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E7 L Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E8 L Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E9 L Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EA L Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EB L Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EC L Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00ED L Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EE L Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EF L Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||
U+00F0 Letter: Lower case letter, Latin, Other, U+00D0
|
||||
U+00F1 Letter: Lower case letter, Latin, Other, U+00D1
|
||||
U+00F2 Letter: Lower case letter, Latin, Other, U+00D2
|
||||
U+00F3 Letter: Lower case letter, Latin, Other, U+00D3
|
||||
U+00F4 Letter: Lower case letter, Latin, Other, U+00D4
|
||||
U+00F5 Letter: Lower case letter, Latin, Other, U+00D5
|
||||
U+00F6 Letter: Lower case letter, Latin, Other, U+00D6
|
||||
U+00F7 Symbol: Mathematical symbol, Common, Other
|
||||
U+00F8 Letter: Lower case letter, Latin, Other, U+00D8
|
||||
U+00F9 Letter: Lower case letter, Latin, Other, U+00D9
|
||||
U+00FA Letter: Lower case letter, Latin, Other, U+00DA
|
||||
U+00FB Letter: Lower case letter, Latin, Other, U+00DB
|
||||
U+00FC Letter: Lower case letter, Latin, Other, U+00DC
|
||||
U+00FD Letter: Lower case letter, Latin, Other, U+00DD
|
||||
U+00FE Letter: Lower case letter, Latin, Other, U+00DE
|
||||
U+00FF Letter: Lower case letter, Latin, Other, U+0178
|
||||
U+00F0 L Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F1 L Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F2 L Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F3 L Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F4 L Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F5 L Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F6 L Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00F8 L Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F9 L Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FA L Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FB L Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FC L Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FD L Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FE L Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FF L Letter: Lower case letter, latin, Other, U+0178, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
|
||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||
U+0100 Letter: Upper case letter, Latin, Other, U+0101
|
||||
U+0101 Letter: Lower case letter, Latin, Other, U+0100
|
||||
U+0102 Letter: Upper case letter, Latin, Other, U+0103
|
||||
U+0103 Letter: Lower case letter, Latin, Other, U+0102
|
||||
U+0104 Letter: Upper case letter, Latin, Other, U+0105
|
||||
U+0105 Letter: Lower case letter, Latin, Other, U+0104
|
||||
U+0106 Letter: Upper case letter, Latin, Other, U+0107
|
||||
U+0100 L Letter: Upper case letter, latin, Other, U+0101, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0101 L Letter: Lower case letter, latin, Other, U+0100, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0102 L Letter: Upper case letter, latin, Other, U+0103, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0103 L Letter: Lower case letter, latin, Other, U+0102, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0104 L Letter: Upper case letter, latin, Other, U+0105, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0105 L Letter: Lower case letter, latin, Other, U+0104, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0106 L Letter: Upper case letter, latin, Other, U+0107, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
|
||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||
U+FFE0 Symbol: Currency symbol, Common, Other
|
||||
U+FFE1 Symbol: Currency symbol, Common, Other
|
||||
U+FFE2 Symbol: Mathematical symbol, Common, Other
|
||||
U+FFE3 Symbol: Modifier symbol, Common, Other
|
||||
U+FFE4 Symbol: Other symbol, Common, Other
|
||||
U+FFE5 Symbol: Currency symbol, Common, Other
|
||||
U+FFE6 Symbol: Currency symbol, Common, Other
|
||||
U+FFE7 Control: Unassigned, Unknown, Other
|
||||
U+FFE0 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE1 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE2 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FFE4 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE5 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE6 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE7 L Control: Unassigned, unknown, Other
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
U+FFE8 Symbol: Other symbol, Common, Other
|
||||
U+FFE9 Symbol: Mathematical symbol, Common, Other
|
||||
U+FFEA Symbol: Mathematical symbol, Common, Other
|
||||
U+FFEB Symbol: Mathematical symbol, Common, Other
|
||||
U+FFEC Symbol: Mathematical symbol, Common, Other
|
||||
U+FFED Symbol: Other symbol, Common, Other
|
||||
U+FFEE Symbol: Other symbol, Common, Other
|
||||
U+FFEF Control: Unassigned, Unknown, Other
|
||||
U+FFE8 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE9 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEA ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEB ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEC ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFED ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFEE ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFEF L Control: Unassigned, unknown, Other
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
U+FFF8 Control: Unassigned, Unknown, Control
|
||||
U+FFF9 Control: Format, Common, Control
|
||||
U+FFFA Control: Format, Common, Control
|
||||
U+FFFB Control: Format, Common, Control
|
||||
U+FFFC Symbol: Other symbol, Common, Other
|
||||
U+FFFD Symbol: Other symbol, Common, Other
|
||||
U+FFFE Control: Unassigned, Unknown, Other
|
||||
U+FFFF Control: Unassigned, Unknown, Other
|
||||
U+FFF8 BN Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
|
||||
U+FFF9 ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFA ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFB ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFC ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFFD ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFFE BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFF BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
U+10000 Letter: Other letter, Linear_B, Other
|
||||
U+10001 Letter: Other letter, Linear_B, Other
|
||||
U+E01EF Mark: Non-spacing mark, Inherited, Extend
|
||||
U+F0000 Control: Private use, Unknown, Other
|
||||
U+100000 Control: Private use, Unknown, Other
|
||||
U+10000 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10001 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
|
||||
U+F0000 L Control: Private use, unknown, Other
|
||||
U+100000 L Control: Private use, unknown, Other
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
U+1B00 Mark: Non-spacing mark, Balinese, Extend
|
||||
U+12000 Letter: Other letter, Cuneiform, Other
|
||||
U+07C0 Number: Decimal number, Nko, Other
|
||||
U+A840 Letter: Other letter, Phags_Pa, Other
|
||||
U+10900 Letter: Other letter, Phoenician, Other
|
||||
U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+12000 L Letter: Other letter, cuneiform, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+07C0 R Number: Decimal number, nko, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+A840 L Letter: Other letter, phagspa, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10900 R Letter: Other letter, phoenician, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
findprop 1d79 a77d
|
||||
U+1D79 Letter: Lower case letter, Latin, Other, U+A77D
|
||||
U+A77D Letter: Upper case letter, Latin, Other, U+1D79
|
||||
U+1D79 L Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+A77D L Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
|
||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||
U+0800 Letter: Other letter, Samaritan, Other
|
||||
U+083E Punctuation: Other punctuation, Samaritan, Other
|
||||
U+A4D0 Letter: Other letter, Lisu, Other
|
||||
U+A4F7 Letter: Other letter, Lisu, Other
|
||||
U+AA80 Letter: Other letter, Tai_Viet, Other
|
||||
U+AADF Punctuation: Other punctuation, Tai_Viet, Other
|
||||
U+0800 R Letter: Other letter, samaritan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+083E R Punctuation: Other punctuation, samaritan, Other, [bidimirrored, graphemebase, math, patternsyntax]
|
||||
U+A4D0 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+A4F7 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AA80 L Letter: Other letter, taiviet, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AADF L Punctuation: Other punctuation, taiviet, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||
U+10B00 Letter: Other letter, Avestan, Other
|
||||
U+10B35 Letter: Other letter, Avestan, Other
|
||||
U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||
U+1342E Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||
U+10840 Letter: Other letter, Imperial_Aramaic, Other
|
||||
U+10855 Letter: Other letter, Imperial_Aramaic, Other
|
||||
U+10B00 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10B35 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+13000 L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1342E L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10840 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10855 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 11100 1113c 11680 116c0
|
||||
U+11100 Mark: Non-spacing mark, Chakma, Extend
|
||||
U+1113C Number: Decimal number, Chakma, Other
|
||||
U+11680 Letter: Other letter, Takri, Other
|
||||
U+116C0 Number: Decimal number, Takri, Other
|
||||
U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+1113C L Number: Decimal number, chakma, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+11680 L Letter: Other letter, takri, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+116C0 L Number: Decimal number, takri, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
|
||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
||||
U+000D Control: Control, Common, CR
|
||||
U+000A Control: Control, Common, LF
|
||||
U+000E Control: Control, Common, Control
|
||||
U+0711 Mark: Non-spacing mark, Syriac, Extend
|
||||
U+1B04 Mark: Spacing mark, Balinese, SpacingMark
|
||||
U+1111 Letter: Other letter, Hangul, Hangul syllable type L
|
||||
U+1169 Letter: Other letter, Hangul, Hangul syllable type V
|
||||
U+11FE Letter: Other letter, Hangul, Hangul syllable type T
|
||||
U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV
|
||||
U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+1B04 L Mark: Spacing mark, balinese, SpacingMark, [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1111 L Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1169 L Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+11FE L Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE4C L Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD89 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 118a0 11ac7 16ad0
|
||||
U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0
|
||||
U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other
|
||||
U+16AD0 Letter: Other letter, Bassa_Vah, Other
|
||||
U+118A0 L Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+11AC7 L Letter: Other letter, paucinhau, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+16AD0 L Letter: Other letter, bassavah, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 11700 14400 108e0 11280 1d800
|
||||
U+11700 Letter: Other letter, Ahom, Other
|
||||
U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
|
||||
U+108E0 Letter: Other letter, Hatran, Other
|
||||
U+11280 Letter: Other letter, Multani, Other
|
||||
U+1D800 Symbol: Other symbol, SignWriting, Other
|
||||
U+11700 L Letter: Other letter, ahom, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+14400 L Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+108E0 R Letter: Other letter, hatran, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+11280 L Letter: Other letter, multani, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1D800 L Symbol: Other symbol, signwriting, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
|
||||
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
||||
U+11800 Letter: Other letter, Dogra, Other
|
||||
U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925
|
||||
U+11DA9 Number: Decimal number, Gunjala_Gondi, Other
|
||||
U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
|
||||
U+11EE0 Letter: Other letter, Makasar, Other
|
||||
U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68
|
||||
U+10F27 Letter: Other letter, Old_Sogdian, Other
|
||||
U+10F30 Letter: Other letter, Sogdian, Other
|
||||
U+11800 L Letter: Other letter, dogra, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1E903 R Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+11DA9 L Number: Decimal number, gunjalagondi, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [extendedpictographic, graphemebase, patternsyntax]
|
||||
U+11EE0 L Letter: Other letter, makasar, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+16E48 L Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+10F27 R Letter: Other letter, oldsogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10F30 AL Letter: Other letter, sogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop a836 a833 1cf4 20f0 1cd0
|
||||
U+A836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
|
||||
U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
|
||||
U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
|
||||
U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
|
||||
U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
|
||||
U+A836 L Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+A833 L Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
|
||||
findprop 32ff
|
||||
U+32FF Symbol: Other symbol, Common, Other, [Han]
|
||||
U+32FF L Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
|
||||
findprop 1f16d
|
||||
U+1F16D Symbol: Other symbol, Common, Extended Pictographic
|
||||
U+1F16D ON Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
|
||||
|
||||
findprop U+10e93 U+10eaa
|
||||
U+10E93 Letter: Other letter, Yezidi, Other
|
||||
U+10EAA Control: Unassigned, Unknown, Other
|
||||
U+10E93 R Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10EAA R Control: Unassigned, unknown, Other
|
||||
|
||||
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
|
||||
U+0602 AN Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
|
||||
U+202A LRE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202B RLE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202D LRO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
|
|
|
@ -1,196 +1,298 @@
|
|||
find script Han
|
||||
U+2E80..U+2E99 Symbol: Other symbol, Han, Other
|
||||
U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other
|
||||
U+2F00..U+2FD5 Symbol: Other symbol, Han, Other
|
||||
U+3005 Letter: Modifier letter, Han, Other
|
||||
U+3007 Number: Letter number, Han, Other
|
||||
U+3021..U+3029 Number: Letter number, Han, Other
|
||||
U+3038..U+303A Number: Letter number, Han, Other
|
||||
U+303B Letter: Modifier letter, Han, Other
|
||||
U+3400..U+4DBF Letter: Other letter, Han, Other
|
||||
U+4E00..U+9FFF Letter: Other letter, Han, Other
|
||||
U+F900..U+FA6D Letter: Other letter, Han, Other
|
||||
U+FA70..U+FAD9 Letter: Other letter, Han, Other
|
||||
U+16FE2 Punctuation: Other punctuation, Han, Other
|
||||
U+16FE3 Letter: Modifier letter, Han, Other
|
||||
U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark
|
||||
U+20000..U+2A6DF Letter: Other letter, Han, Other
|
||||
U+2A700..U+2B738 Letter: Other letter, Han, Other
|
||||
U+2B740..U+2B81D Letter: Other letter, Han, Other
|
||||
U+2B820..U+2CEA1 Letter: Other letter, Han, Other
|
||||
U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other
|
||||
U+2F800..U+2FA1D Letter: Other letter, Han, Other
|
||||
U+30000..U+3134A Letter: Other letter, Han, Other
|
||||
U+2E80..U+2E99 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+2E9B..U+2EF3 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+2F00..U+2FD5 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+3005 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+3007 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+3021..U+3029 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+3038..U+303A L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+303B L Letter: Modifier letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
|
||||
U+3400..U+4DBF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+4E00..U+9FFF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+F900..U+FA0D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA0E..U+FA0F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA10 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA11 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA12 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA13..U+FA14 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA15..U+FA1E L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA1F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA20 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA21 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA22 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA23..U+FA24 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA25..U+FA26 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA27..U+FA29 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA2A..U+FA6D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA70..U+FAD9 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+16FE2 ON Punctuation: Other punctuation, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+16FE3 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+16FF0..U+16FF1 L Mark: Spacing mark, han, SpacingMark, [caseignorable, graphemeextend, idcontinue, ideographic, xidcontinue]
|
||||
U+20000..U+2A6DF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2A700..U+2B738 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2B740..U+2B81D L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2B820..U+2CEA1 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2CEB0..U+2EBE0 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2F800..U+2FA1D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+30000..U+3134A L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
find type Pe script Common scriptx Hangul
|
||||
U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||
U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3009 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+300B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+300D ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+300F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+3011 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3015 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3017 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3019 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+301B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+301E..U+301F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [softdotted, terminalpunctuation, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FF63 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, emojimodifier, emojimodifierbase]
|
||||
find type Sk
|
||||
U+005E Symbol: Modifier symbol, Common, Other
|
||||
U+0060 Symbol: Modifier symbol, Common, Other
|
||||
U+00A8 Symbol: Modifier symbol, Common, Other
|
||||
U+00AF Symbol: Modifier symbol, Common, Other
|
||||
U+00B4 Symbol: Modifier symbol, Common, Other
|
||||
U+00B8 Symbol: Modifier symbol, Common, Other
|
||||
U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other
|
||||
U+02D2..U+02DF Symbol: Modifier symbol, Common, Other
|
||||
U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other
|
||||
U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other
|
||||
U+02ED Symbol: Modifier symbol, Common, Other
|
||||
U+02EF..U+02FF Symbol: Modifier symbol, Common, Other
|
||||
U+0375 Symbol: Modifier symbol, Greek, Other
|
||||
U+0384 Symbol: Modifier symbol, Greek, Other
|
||||
U+0385 Symbol: Modifier symbol, Common, Other
|
||||
U+0888 Symbol: Modifier symbol, Arabic, Other
|
||||
U+1FBD Symbol: Modifier symbol, Greek, Other
|
||||
U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other
|
||||
U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other
|
||||
U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other
|
||||
U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other
|
||||
U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other
|
||||
U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
|
||||
U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin]
|
||||
U+A708..U+A716 Symbol: Modifier symbol, Common, Other
|
||||
U+A720..U+A721 Symbol: Modifier symbol, Common, Other
|
||||
U+A789..U+A78A Symbol: Modifier symbol, Common, Other
|
||||
U+AB5B Symbol: Modifier symbol, Common, Other
|
||||
U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other
|
||||
U+FBB2..U+FBC2 Symbol: Modifier symbol, Arabic, Other
|
||||
U+FF3E Symbol: Modifier symbol, Common, Other
|
||||
U+FF40 Symbol: Modifier symbol, Common, Other
|
||||
U+FFE3 Symbol: Modifier symbol, Common, Other
|
||||
U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend
|
||||
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02C2..U+02C5 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02D2..U+02DF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02E5..U+02E9 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02ED ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02EF..U+02FF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0375 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0384 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0385 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0888 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
|
||||
U+1FBD ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FBF..U+1FC1 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FCD..U+1FCF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FDD..U+1FDF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FED..U+1FEF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FFD..U+1FFE ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+309B..U+309C ON Symbol: Modifier symbol, common, Other, [hiragana, katakana], [alphabetic, bidimirrored, caseignorable, cased, changeswhencasefolded, changeswhenlowercased, changeswhentitlecased, changeswhenuppercased, dash, defaultignorablecodepoint, deprecated, diacritic, emoji, emojicomponent, emojimodifier, emojimodifierbase, emojipresentation, extendedpictographic, extender, graphemebase, graphemeextend, graphemelink, hexdigit, idsbinaryoperator, idstrinaryoperator, idcontinue, idstart, ideographic, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
|
||||
U+A700..U+A707 ON Symbol: Modifier symbol, common, Other, [latin, han], [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A708..U+A716 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A720..U+A721 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A789..U+A78A L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+AB5B L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+AB6A..U+AB6B ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FBB2..U+FBC2 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
|
||||
U+FF3E ON Symbol: Modifier symbol, common, Other, [asciihexdigit, bidicontrol, bidimirrored, cased, changeswhencasefolded, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+FF40 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1F3FB..U+1F3FF ON Symbol: Modifier symbol, common, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternsyntax, radical, sentenceterminal, terminalpunctuation]
|
||||
find type Pd
|
||||
U+002D Punctuation: Dash punctuation, Common, Other
|
||||
U+058A Punctuation: Dash punctuation, Armenian, Other
|
||||
U+05BE Punctuation: Dash punctuation, Hebrew, Other
|
||||
U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other
|
||||
U+1806 Punctuation: Dash punctuation, Mongolian, Other
|
||||
U+2010..U+2015 Punctuation: Dash punctuation, Common, Other
|
||||
U+2E17 Punctuation: Dash punctuation, Common, Other
|
||||
U+2E1A Punctuation: Dash punctuation, Common, Other
|
||||
U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other
|
||||
U+2E40 Punctuation: Dash punctuation, Common, Other
|
||||
U+2E5D Punctuation: Dash punctuation, Common, Other
|
||||
U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||
U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||
U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
|
||||
U+FE31..U+FE32 Punctuation: Dash punctuation, Common, Other
|
||||
U+FE58 Punctuation: Dash punctuation, Common, Other
|
||||
U+FE63 Punctuation: Dash punctuation, Common, Other
|
||||
U+FF0D Punctuation: Dash punctuation, Common, Other
|
||||
U+10EAD Punctuation: Dash punctuation, Yezidi, Other
|
||||
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+058A ON Punctuation: Dash punctuation, armenian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+05BE R Punctuation: Dash punctuation, hebrew, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1400 ON Punctuation: Dash punctuation, canadianaboriginal, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1806 ON Punctuation: Dash punctuation, mongolian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+2010..U+2015 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E17 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E1A ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E3A..U+2E3B ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E40 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E5D ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+301C ON Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+30A0 ON Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE31..U+FE32 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE58 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE63 ES Punctuation: Dash punctuation, common, Other, [caseignorable, sentenceterminal, unifiedideograph, xidcontinue]
|
||||
U+FF0D ES Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+10EAD R Punctuation: Dash punctuation, yezidi, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
find gbreak LVT
|
||||
U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AEA1..U+AEBB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B1B1..U+B1CB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4C1..U+B4DB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC01..U+AC1B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC1D..U+AC37 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC39..U+AC53 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC55..U+AC6F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC71..U+AC8B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC8D..U+ACA7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACA9..U+ACC3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACC5..U+ACDF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACE1..U+ACFB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACFD..U+AD17 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD19..U+AD33 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD35..U+AD4F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD51..U+AD6B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD6D..U+AD87 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD89..U+ADA3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADA5..U+ADBF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADC1..U+ADDB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADDD..U+ADF7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADF9..U+AE13 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE15..U+AE2F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE31..U+AE4B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE4D..U+AE67 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE69..U+AE83 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE85..U+AE9F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEA1..U+AEBB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEBD..U+AED7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AED9..U+AEF3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEF5..U+AF0F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF11..U+AF2B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF2D..U+AF47 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF49..U+AF63 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF65..U+AF7F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF81..U+AF9B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF9D..U+AFB7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFB9..U+AFD3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFD5..U+AFEF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFF1..U+B00B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B00D..U+B027 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B029..U+B043 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B045..U+B05F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B061..U+B07B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B07D..U+B097 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B099..U+B0B3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0B5..U+B0CF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0D1..U+B0EB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0ED..U+B107 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B109..U+B123 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B125..U+B13F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B141..U+B15B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B15D..U+B177 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B179..U+B193 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B195..U+B1AF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1B1..U+B1CB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1CD..U+B1E7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1E9..U+B203 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B205..U+B21F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B221..U+B23B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B23D..U+B257 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B259..U+B273 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B275..U+B28F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B291..U+B2AB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2AD..U+B2C7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2C9..U+B2E3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2E5..U+B2FF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B301..U+B31B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B31D..U+B337 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B339..U+B353 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B355..U+B36F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B371..U+B38B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B38D..U+B3A7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3A9..U+B3C3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3C5..U+B3DF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3E1..U+B3FB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3FD..U+B417 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B419..U+B433 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B435..U+B44F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B451..U+B46B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B46D..U+B487 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B489..U+B4A3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4A5..U+B4BF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4C1..U+B4DB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4DD..U+B4F7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4F9..U+B513 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B515..U+B52F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B531..U+B54B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B54D..U+B567 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B569..U+B583 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B585..U+B59F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5A1..U+B5BB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5BD..U+B5D7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5D9..U+B5F3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5F5..U+B60F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B611..U+B62B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B62D..U+B647 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B649..U+B663 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B665..U+B67F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B681..U+B69B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B69D..U+B6B7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B6B9..U+B6D3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B6D5..U+B6EF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
...
|
||||
find script Old_Uyghur
|
||||
U+10F70..U+10F81 Letter: Other letter, Old_Uyghur, Other
|
||||
U+10F82..U+10F85 Mark: Non-spacing mark, Old_Uyghur, Extend
|
||||
U+10F86..U+10F89 Punctuation: Other punctuation, Old_Uyghur, Other
|
||||
U+10F70..U+10F81 R Letter: Other letter, olduyghur, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+10F86..U+10F89 R Punctuation: Other punctuation, olduyghur, Other, [bidimirrored, graphemebase, math, patternsyntax]
|
||||
find bidi PDF
|
||||
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi CS
|
||||
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
|
||||
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+060C CS Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+202F CS Separator: Space separator, common, Other, [latin, mongolian], [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+FE50 CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+FE52 CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF0C CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+FF0E CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FF0F CS Punctuation: Other punctuation, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
find bidi CS type Sm
|
||||
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
find bidi B
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+001C..U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+2029 B Separator: Paragraph separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
find bidi FSI
|
||||
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi PDI
|
||||
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi RLI
|
||||
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi RLO
|
||||
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi S
|
||||
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
find bidi WS
|
||||
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
|
||||
U+1680 WS Separator: Space separator, ogham, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2000..U+200A WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2028 WS Separator: Line separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+205F WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+3000 WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
find script bopo
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+3105..U+312F L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+31A0..U+31BF L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
find bool prependedconcatenationmark
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
|
||||
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
find bool pcm
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
|
||||
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
|
|
|
@ -97,6 +97,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
/* Have PTHREAD_PRIO_INHERIT. */
|
||||
/* #undef HAVE_PTHREAD_PRIO_INHERIT */
|
||||
|
||||
/* Define to 1 if you have the <readline.h> header file. */
|
||||
/* #undef HAVE_READLINE_H */
|
||||
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
/* #undef HAVE_READLINE_HISTORY_H */
|
||||
|
||||
|
@ -233,7 +236,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_NAME "PCRE2"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE2 10.39"
|
||||
#define PACKAGE_STRING "PCRE2 10.40"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre2"
|
||||
|
@ -242,7 +245,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "10.39"
|
||||
#define PACKAGE_VERSION "10.40"
|
||||
|
||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||
|
@ -435,7 +438,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#endif
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "10.39"
|
||||
#define VERSION "10.40"
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
|
|
@ -97,6 +97,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
/* Have PTHREAD_PRIO_INHERIT. */
|
||||
#undef HAVE_PTHREAD_PRIO_INHERIT
|
||||
|
||||
/* Define to 1 if you have the <readline.h> header file. */
|
||||
#undef HAVE_READLINE_H
|
||||
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
#undef HAVE_READLINE_HISTORY_H
|
||||
|
||||
|
|
|
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE2_MAJOR 10
|
||||
#define PCRE2_MINOR 39
|
||||
#define PCRE2_MINOR 40
|
||||
#define PCRE2_PRERELEASE
|
||||
#define PCRE2_DATE 2021-10-29
|
||||
#define PCRE2_DATE 2022-04-14
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -123,18 +123,21 @@ opcode is used to select the column. The values are as follows:
|
|||
*/
|
||||
|
||||
static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
|
||||
/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
|
||||
{ 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
|
||||
{ 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
|
||||
{ 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
|
||||
{ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
|
||||
{ 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
|
||||
{ 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
|
||||
{ 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
|
||||
{ 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
|
||||
/* ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
|
||||
{ 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */
|
||||
{ 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */
|
||||
{ 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */
|
||||
{ 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
|
||||
{ 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */
|
||||
{ 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */
|
||||
{ 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */
|
||||
{ 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */
|
||||
{ 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */
|
||||
};
|
||||
|
||||
/* This table is used to check whether auto-possessification is possible
|
||||
|
@ -196,6 +199,7 @@ static BOOL
|
|||
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
|
||||
BOOL negated)
|
||||
{
|
||||
BOOL ok;
|
||||
const uint32_t *p;
|
||||
const ucd_record *prop = GET_UCD(c);
|
||||
|
||||
|
@ -215,6 +219,11 @@ switch(ptype)
|
|||
case PT_SC:
|
||||
return (pdata == prop->script) == negated;
|
||||
|
||||
case PT_SCX:
|
||||
ok = (pdata == prop->script
|
||||
|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
|
||||
return ok == negated;
|
||||
|
||||
/* These are specials */
|
||||
|
||||
case PT_ALNUM:
|
||||
|
@ -251,6 +260,14 @@ switch(ptype)
|
|||
if (c == *p++) return negated;
|
||||
}
|
||||
break; /* Control never reaches here */
|
||||
|
||||
/* Haven't yet thought these through. */
|
||||
|
||||
case PT_BIDICL:
|
||||
return FALSE;
|
||||
|
||||
case PT_BOOL:
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -124,7 +124,7 @@ static unsigned int
|
|||
|
||||
static int
|
||||
compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
|
||||
uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
|
||||
uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
|
||||
compile_block *, PCRE2_SIZE *);
|
||||
|
||||
static int
|
||||
|
@ -385,13 +385,15 @@ compiler is clever with identical subexpressions. */
|
|||
|
||||
#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
|
||||
|
||||
/* Private flags added to firstcu and reqcu. */
|
||||
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
|
||||
variables, which are concerned with first and required code units. A value
|
||||
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
|
||||
matching xxcu variable is set, and the low valued bits are relevant. */
|
||||
|
||||
#define REQ_CASELESS (1u << 0) /* Indicates caselessness */
|
||||
#define REQ_VARY (1u << 1) /* reqcu followed non-literal item */
|
||||
/* Negative values for the firstcu and reqcu flags */
|
||||
#define REQ_UNSET (-2) /* Not yet found anything */
|
||||
#define REQ_NONE (-1) /* Found not fixed char */
|
||||
#define REQ_UNSET 0xffffffffu /* Not yet found anything */
|
||||
#define REQ_NONE 0xfffffffeu /* Found not fixed character */
|
||||
#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */
|
||||
#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */
|
||||
|
||||
/* These flags are used in the groupinfo vector. */
|
||||
|
||||
|
@ -1264,8 +1266,10 @@ PCRE2_SIZE* ref_count;
|
|||
|
||||
if (code != NULL)
|
||||
{
|
||||
#ifdef SUPPORT_JIT
|
||||
if (code->executable_jit != NULL)
|
||||
PRIV(jit_free)(code->executable_jit, &code->memctl);
|
||||
#endif
|
||||
|
||||
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
|
||||
{
|
||||
|
@ -2088,7 +2092,9 @@ get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
|
|||
PCRE2_UCHAR c;
|
||||
PCRE2_SIZE i, bot, top;
|
||||
PCRE2_SPTR ptr = *ptrptr;
|
||||
PCRE2_UCHAR name[32];
|
||||
PCRE2_UCHAR name[50];
|
||||
PCRE2_UCHAR *vptr = NULL;
|
||||
uint16_t ptscript = PT_NOTSCRIPT;
|
||||
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
c = *ptr++;
|
||||
|
@ -2100,36 +2106,95 @@ negation. */
|
|||
if (c == CHAR_LEFT_CURLY_BRACKET)
|
||||
{
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
|
||||
if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
|
||||
{
|
||||
*negptr = TRUE;
|
||||
ptr++;
|
||||
}
|
||||
|
||||
for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
|
||||
{
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
c = *ptr++;
|
||||
while (c == '_' || c == '-' || isspace(c))
|
||||
{
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
c = *ptr++;
|
||||
}
|
||||
if (c == CHAR_NUL) goto ERROR_RETURN;
|
||||
if (c == CHAR_RIGHT_CURLY_BRACKET) break;
|
||||
name[i] = c;
|
||||
name[i] = tolower(c);
|
||||
if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
|
||||
}
|
||||
|
||||
if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
|
||||
name[i] = 0;
|
||||
}
|
||||
|
||||
/* Otherwise there is just one following character, which must be an ASCII
|
||||
letter. */
|
||||
/* If { doesn't follow \p or \P there is just one following character, which
|
||||
must be an ASCII letter. */
|
||||
|
||||
else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
|
||||
{
|
||||
name[0] = c;
|
||||
name[0] = tolower(c);
|
||||
name[1] = 0;
|
||||
}
|
||||
else goto ERROR_RETURN;
|
||||
|
||||
*ptrptr = ptr;
|
||||
|
||||
/* Search for a recognized property name using binary chop. */
|
||||
/* If the property contains ':' or '=' we have class name and value separately
|
||||
specified. The following are supported:
|
||||
|
||||
. Bidi_Class (synonym bc), for which the property names are "bidi<name>".
|
||||
. Script (synonym sc) for which the property name is the script name
|
||||
. Script_Extensions (synonym scx), ditto
|
||||
|
||||
As this is a small number, we currently just check the names directly. If this
|
||||
grows, a sorted table and a switch will be neater.
|
||||
|
||||
For both the script properties, set a PT_xxx value so that (1) they can be
|
||||
distinguished and (2) invalid script names that happen to be the name of
|
||||
another property can be diagnosed. */
|
||||
|
||||
if (vptr != NULL)
|
||||
{
|
||||
int offset = 0;
|
||||
PCRE2_UCHAR sname[8];
|
||||
|
||||
*vptr = 0; /* Terminate property name */
|
||||
if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
|
||||
PRIV(strcmp_c8)(name, STRING_bc) == 0)
|
||||
{
|
||||
offset = 4;
|
||||
sname[0] = CHAR_b;
|
||||
sname[1] = CHAR_i; /* There is no strcpy_c8 function */
|
||||
sname[2] = CHAR_d;
|
||||
sname[3] = CHAR_i;
|
||||
}
|
||||
|
||||
else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
|
||||
PRIV(strcmp_c8)(name, STRING_sc) == 0)
|
||||
ptscript = PT_SC;
|
||||
|
||||
else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
|
||||
PRIV(strcmp_c8)(name, STRING_scx) == 0)
|
||||
ptscript = PT_SCX;
|
||||
|
||||
else
|
||||
{
|
||||
*errorcodeptr = ERR47;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* Adjust the string in name[] as needed */
|
||||
|
||||
memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
|
||||
if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
|
||||
}
|
||||
|
||||
/* Search for a recognized property using binary chop. */
|
||||
|
||||
bot = 0;
|
||||
top = PRIV(utt_size);
|
||||
|
@ -2139,15 +2204,37 @@ while (bot < top)
|
|||
int r;
|
||||
i = (bot + top) >> 1;
|
||||
r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
|
||||
/* When a matching property is found, some extra checking is needed when the
|
||||
\p{xx:yy} syntax is used and xx is either sc or scx. */
|
||||
|
||||
if (r == 0)
|
||||
{
|
||||
*ptypeptr = PRIV(utt)[i].type;
|
||||
*pdataptr = PRIV(utt)[i].value;
|
||||
if (vptr == NULL || ptscript == PT_NOTSCRIPT)
|
||||
{
|
||||
*ptypeptr = PRIV(utt)[i].type;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
switch (PRIV(utt)[i].type)
|
||||
{
|
||||
case PT_SC:
|
||||
*ptypeptr = PT_SC;
|
||||
return TRUE;
|
||||
|
||||
case PT_SCX:
|
||||
*ptypeptr = ptscript;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
break; /* Non-script found */
|
||||
}
|
||||
|
||||
if (r > 0) bot = i + 1; else top = i;
|
||||
}
|
||||
*errorcodeptr = ERR47; /* Unrecognized name */
|
||||
|
||||
*errorcodeptr = ERR47; /* Unrecognized property */
|
||||
return FALSE;
|
||||
|
||||
ERROR_RETURN: /* Malformed \P or \p */
|
||||
|
@ -5285,9 +5372,9 @@ Arguments:
|
|||
pptrptr points to the current parsed pattern pointer
|
||||
errorcodeptr points to error code variable
|
||||
firstcuptr place to put the first required code unit
|
||||
firstcuflagsptr place to put the first code unit flags, or a negative number
|
||||
firstcuflagsptr place to put the first code unit flags
|
||||
reqcuptr place to put the last required code unit
|
||||
reqcuflagsptr place to put the last required code unit flags, or a negative number
|
||||
reqcuflagsptr place to put the last required code unit flags
|
||||
bcptr points to current branch chain
|
||||
cb contains pointers to tables etc.
|
||||
lengthptr NULL during the real compile phase
|
||||
|
@ -5300,8 +5387,8 @@ Returns: 0 There's been an error, *errorcodeptr is non-zero
|
|||
|
||||
static int
|
||||
compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
||||
int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
|
||||
uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
|
||||
int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr,
|
||||
uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr,
|
||||
compile_block *cb, PCRE2_SIZE *lengthptr)
|
||||
{
|
||||
int bravalue = 0;
|
||||
|
@ -5316,9 +5403,9 @@ uint32_t zeroreqcu, zerofirstcu;
|
|||
uint32_t escape;
|
||||
uint32_t *pptr = *pptrptr;
|
||||
uint32_t meta, meta_arg;
|
||||
int32_t firstcuflags, reqcuflags;
|
||||
int32_t zeroreqcuflags, zerofirstcuflags;
|
||||
int32_t req_caseopt, reqvary, tempreqvary;
|
||||
uint32_t firstcuflags, reqcuflags;
|
||||
uint32_t zeroreqcuflags, zerofirstcuflags;
|
||||
uint32_t req_caseopt, reqvary, tempreqvary;
|
||||
PCRE2_SIZE offset = 0;
|
||||
PCRE2_SIZE length_prevgroup = 0;
|
||||
PCRE2_UCHAR *code = *codeptr;
|
||||
|
@ -5374,13 +5461,13 @@ item types that can be repeated set these backoff variables appropriately. */
|
|||
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
|
||||
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
|
||||
|
||||
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
|
||||
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
|
||||
according to the current setting of the caseless flag. The REQ_CASELESS value
|
||||
leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
|
||||
to record the case status of the value. This is used only for ASCII characters.
|
||||
*/
|
||||
|
||||
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
|
||||
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
|
||||
|
||||
/* Switch on next META item until the end of the branch */
|
||||
|
||||
|
@ -5395,13 +5482,12 @@ for (;; pptr++)
|
|||
BOOL possessive_quantifier;
|
||||
BOOL note_group_empty;
|
||||
int class_has_8bitchar;
|
||||
int i;
|
||||
uint32_t mclength;
|
||||
uint32_t skipunits;
|
||||
uint32_t subreqcu, subfirstcu;
|
||||
uint32_t groupnumber;
|
||||
uint32_t verbarglen, verbculen;
|
||||
int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
|
||||
uint32_t subreqcuflags, subfirstcuflags;
|
||||
open_capitem *oc;
|
||||
PCRE2_UCHAR mcbuffer[8];
|
||||
|
||||
|
@ -5770,9 +5856,9 @@ for (;; pptr++)
|
|||
if (taboffset >= 0)
|
||||
{
|
||||
if (tabopt >= 0)
|
||||
for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
|
||||
for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
|
||||
else
|
||||
for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
|
||||
for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
|
||||
}
|
||||
|
||||
/* Now see if we need to remove any special characters. An option
|
||||
|
@ -5786,9 +5872,9 @@ for (;; pptr++)
|
|||
being built and we are done. */
|
||||
|
||||
if (local_negate)
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
|
||||
else
|
||||
for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
|
||||
|
||||
/* Every class contains at least one < 256 character. */
|
||||
|
||||
|
@ -5827,21 +5913,23 @@ for (;; pptr++)
|
|||
switch(escape)
|
||||
{
|
||||
case ESC_d:
|
||||
for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
|
||||
break;
|
||||
|
||||
case ESC_D:
|
||||
should_flip_negation = TRUE;
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
|
||||
for (int i = 0; i < 32; i++)
|
||||
classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
|
||||
break;
|
||||
|
||||
case ESC_w:
|
||||
for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
|
||||
break;
|
||||
|
||||
case ESC_W:
|
||||
should_flip_negation = TRUE;
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
|
||||
for (int i = 0; i < 32; i++)
|
||||
classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
|
||||
break;
|
||||
|
||||
/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
|
||||
|
@ -5852,12 +5940,13 @@ for (;; pptr++)
|
|||
longer treat \s and \S specially. */
|
||||
|
||||
case ESC_s:
|
||||
for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
|
||||
break;
|
||||
|
||||
case ESC_S:
|
||||
should_flip_negation = TRUE;
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
|
||||
for (int i = 0; i < 32; i++)
|
||||
classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
|
||||
break;
|
||||
|
||||
/* When adding the horizontal or vertical space lists to a class, or
|
||||
|
@ -6098,7 +6187,7 @@ for (;; pptr++)
|
|||
if (negate_class && !xclass_has_prop)
|
||||
{
|
||||
/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
|
||||
for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
}
|
||||
memcpy(code, classbits, 32);
|
||||
code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
|
||||
|
@ -6124,7 +6213,7 @@ for (;; pptr++)
|
|||
if (negate_class)
|
||||
{
|
||||
/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
|
||||
for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
}
|
||||
memcpy(code, classbits, 32);
|
||||
}
|
||||
|
@ -6198,7 +6287,7 @@ for (;; pptr++)
|
|||
verbarglen = *(++pptr);
|
||||
verbculen = 0;
|
||||
tempcode = code++;
|
||||
for (i = 0; i < (int)verbarglen; i++)
|
||||
for (int i = 0; i < (int)verbarglen; i++)
|
||||
{
|
||||
meta = *(++pptr);
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
@ -6247,6 +6336,7 @@ for (;; pptr++)
|
|||
bravalue = OP_COND;
|
||||
{
|
||||
int count, index;
|
||||
unsigned int i;
|
||||
PCRE2_SPTR name;
|
||||
named_group *ng = cb->named_groups;
|
||||
uint32_t length = *(++pptr);
|
||||
|
@ -6286,7 +6376,7 @@ for (;; pptr++)
|
|||
groupnumber = 0;
|
||||
if (meta == META_COND_RNUMBER)
|
||||
{
|
||||
for (i = 1; i < (int)length; i++)
|
||||
for (i = 1; i < length; i++)
|
||||
{
|
||||
groupnumber = groupnumber * 10 + name[i] - CHAR_0;
|
||||
if (groupnumber > MAX_GROUP_NUMBER)
|
||||
|
@ -6608,7 +6698,7 @@ for (;; pptr++)
|
|||
|
||||
if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
|
||||
{
|
||||
if (subfirstcuflags >= 0)
|
||||
if (subfirstcuflags < REQ_NONE)
|
||||
{
|
||||
firstcu = subfirstcu;
|
||||
firstcuflags = subfirstcuflags;
|
||||
|
@ -6622,7 +6712,7 @@ for (;; pptr++)
|
|||
into reqcu if there wasn't one, using the vary flag that was in
|
||||
existence beforehand. */
|
||||
|
||||
else if (subfirstcuflags >= 0 && subreqcuflags < 0)
|
||||
else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
|
||||
{
|
||||
subreqcu = subfirstcu;
|
||||
subreqcuflags = subfirstcuflags | tempreqvary;
|
||||
|
@ -6631,7 +6721,7 @@ for (;; pptr++)
|
|||
/* If the subpattern set a required code unit (or set a first code unit
|
||||
that isn't really the first code unit - see above), set it. */
|
||||
|
||||
if (subreqcuflags >= 0)
|
||||
if (subreqcuflags < REQ_NONE)
|
||||
{
|
||||
reqcu = subreqcu;
|
||||
reqcuflags = subreqcuflags;
|
||||
|
@ -6650,7 +6740,7 @@ for (;; pptr++)
|
|||
in that example, 'X' ends up set for both. */
|
||||
|
||||
else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
|
||||
subreqcuflags >= 0 && subfirstcuflags >= 0)
|
||||
subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
|
||||
{
|
||||
reqcu = subreqcu;
|
||||
reqcuflags = subreqcuflags;
|
||||
|
@ -6680,7 +6770,7 @@ for (;; pptr++)
|
|||
this name is duplicated. */
|
||||
|
||||
groupnumber = 0;
|
||||
for (i = 0; i < cb->names_found; i++, ng++)
|
||||
for (unsigned int i = 0; i < cb->names_found; i++, ng++)
|
||||
{
|
||||
if (length == ng->length &&
|
||||
PRIV(strncmp)(name, ng->name, length) == 0)
|
||||
|
@ -6935,14 +7025,19 @@ for (;; pptr++)
|
|||
#endif /* MAYBE_UTF_MULTI */
|
||||
|
||||
/* Handle the case of a single code unit - either with no UTF support, or
|
||||
with UTF disabled, or for a single-code-unit UTF character. */
|
||||
with UTF disabled, or for a single-code-unit UTF character. In the latter
|
||||
case, for a repeated positive match, get the caseless flag for the
|
||||
required code unit from the previous character, because a class like [Aa]
|
||||
sets a caseless A but by now the req_caseopt flag has been reset. */
|
||||
|
||||
{
|
||||
mcbuffer[0] = code[-1];
|
||||
mclength = 1;
|
||||
if (op_previous <= OP_CHARI && repeat_min > 1)
|
||||
{
|
||||
reqcu = mcbuffer[0];
|
||||
reqcuflags = req_caseopt | cb->req_varyopt;
|
||||
reqcuflags = cb->req_varyopt;
|
||||
if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
|
||||
}
|
||||
}
|
||||
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
|
||||
|
@ -7034,7 +7129,7 @@ for (;; pptr++)
|
|||
*lengthptr += delta;
|
||||
}
|
||||
|
||||
else for (i = 0; i < replicate; i++)
|
||||
else for (int i = 0; i < replicate; i++)
|
||||
{
|
||||
memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
|
||||
previous = code;
|
||||
|
@ -7210,12 +7305,12 @@ for (;; pptr++)
|
|||
|
||||
else
|
||||
{
|
||||
if (groupsetfirstcu && reqcuflags < 0)
|
||||
if (groupsetfirstcu && reqcuflags >= REQ_NONE)
|
||||
{
|
||||
reqcu = firstcu;
|
||||
reqcuflags = firstcuflags;
|
||||
}
|
||||
for (i = 1; (uint32_t)i < repeat_min; i++)
|
||||
for (uint32_t i = 1; i < repeat_min; i++)
|
||||
{
|
||||
memcpy(code, previous, CU2BYTES(len));
|
||||
code += len;
|
||||
|
@ -7259,14 +7354,14 @@ for (;; pptr++)
|
|||
|
||||
/* This is compiling for real */
|
||||
|
||||
else for (i = repeat_max - 1; i >= 0; i--)
|
||||
else for (uint32_t i = repeat_max; i >= 1; i--)
|
||||
{
|
||||
*code++ = OP_BRAZERO + repeat_type;
|
||||
|
||||
/* All but the final copy start a new nesting, maintaining the
|
||||
chain of brackets outstanding. */
|
||||
|
||||
if (i != 0)
|
||||
if (i != 1)
|
||||
{
|
||||
int linkoffset;
|
||||
*code++ = OP_BRA;
|
||||
|
@ -7985,9 +8080,9 @@ Arguments:
|
|||
errorcodeptr -> pointer to error code variable
|
||||
skipunits skip this many code units at start (for brackets and OP_COND)
|
||||
firstcuptr place to put the first required code unit
|
||||
firstcuflagsptr place to put the first code unit flags, or a negative number
|
||||
firstcuflagsptr place to put the first code unit flags
|
||||
reqcuptr place to put the last required code unit
|
||||
reqcuflagsptr place to put the last required code unit flags, or a negative number
|
||||
reqcuflagsptr place to put the last required code unit flags
|
||||
bcptr pointer to the chain of currently open branches
|
||||
cb points to the data block with tables pointers etc.
|
||||
lengthptr NULL during the real compile phase
|
||||
|
@ -8001,7 +8096,7 @@ Returns: 0 There has been an error
|
|||
static int
|
||||
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
||||
int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
|
||||
int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
|
||||
uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr,
|
||||
branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
|
||||
{
|
||||
PCRE2_UCHAR *code = *codeptr;
|
||||
|
@ -8014,9 +8109,9 @@ int okreturn = 1;
|
|||
uint32_t *pptr = *pptrptr;
|
||||
uint32_t firstcu, reqcu;
|
||||
uint32_t lookbehindlength;
|
||||
int32_t firstcuflags, reqcuflags;
|
||||
uint32_t firstcuflags, reqcuflags;
|
||||
uint32_t branchfirstcu, branchreqcu;
|
||||
int32_t branchfirstcuflags, branchreqcuflags;
|
||||
uint32_t branchfirstcuflags, branchreqcuflags;
|
||||
PCRE2_SIZE length;
|
||||
branch_chain bc;
|
||||
|
||||
|
@ -8135,9 +8230,9 @@ for (;;)
|
|||
|
||||
if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
|
||||
{
|
||||
if (firstcuflags >= 0)
|
||||
if (firstcuflags < REQ_NONE)
|
||||
{
|
||||
if (reqcuflags < 0)
|
||||
if (reqcuflags >= REQ_NONE)
|
||||
{
|
||||
reqcu = firstcu;
|
||||
reqcuflags = firstcuflags;
|
||||
|
@ -8149,8 +8244,8 @@ for (;;)
|
|||
/* If we (now or from before) have no firstcu, a firstcu from the
|
||||
branch becomes a reqcu if there isn't a branch reqcu. */
|
||||
|
||||
if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
|
||||
branchreqcuflags < 0)
|
||||
if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
|
||||
branchreqcuflags >= REQ_NONE)
|
||||
{
|
||||
branchreqcu = branchfirstcu;
|
||||
branchreqcuflags = branchfirstcuflags;
|
||||
|
@ -8298,7 +8393,7 @@ Returns: TRUE or FALSE
|
|||
*/
|
||||
|
||||
static BOOL
|
||||
is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
|
||||
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
|
||||
int atomcount, BOOL inassert)
|
||||
{
|
||||
do {
|
||||
|
@ -8321,7 +8416,7 @@ do {
|
|||
op == OP_SCBRA || op == OP_SCBRAPOS)
|
||||
{
|
||||
int n = GET2(scode, 1+LINK_SIZE);
|
||||
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||||
uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||||
if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
|
||||
}
|
||||
|
||||
|
@ -8681,15 +8776,15 @@ Returns: the fixed first code unit, or 0 with REQ_NONE in flags
|
|||
*/
|
||||
|
||||
static uint32_t
|
||||
find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
|
||||
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
|
||||
{
|
||||
uint32_t c = 0;
|
||||
int cflags = REQ_NONE;
|
||||
uint32_t cflags = REQ_NONE;
|
||||
|
||||
*flags = REQ_NONE;
|
||||
do {
|
||||
uint32_t d;
|
||||
int dflags;
|
||||
uint32_t dflags;
|
||||
int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
|
||||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
|
||||
PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
|
||||
|
@ -8712,9 +8807,8 @@ do {
|
|||
case OP_SCRIPT_RUN:
|
||||
d = find_firstassertedcu(scode, &dflags, inassert +
|
||||
((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
|
||||
if (dflags < 0)
|
||||
return 0;
|
||||
if (cflags < 0) { c = d; cflags = dflags; }
|
||||
if (dflags >= REQ_NONE) return 0;
|
||||
if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
|
||||
else if (c != d || cflags != dflags) return 0;
|
||||
break;
|
||||
|
||||
|
@ -8727,7 +8821,7 @@ do {
|
|||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
if (inassert == 0) return 0;
|
||||
if (cflags < 0) { c = scode[1]; cflags = 0; }
|
||||
if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
|
||||
else if (c != scode[1]) return 0;
|
||||
break;
|
||||
|
||||
|
@ -8753,7 +8847,7 @@ do {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
|
||||
if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
|
||||
else if (c != scode[1]) return 0;
|
||||
break;
|
||||
}
|
||||
|
@ -9689,7 +9783,7 @@ PCRE2_SIZE re_blocksize; /* Size of memory block */
|
|||
PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
|
||||
PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
|
||||
|
||||
int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
|
||||
uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
|
||||
uint32_t firstcu, reqcu; /* Value of first/req code unit */
|
||||
uint32_t setflags = 0; /* NL and BSR set flags */
|
||||
|
||||
|
@ -10369,13 +10463,13 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
(these are not saved during the compile because they can cause conflicts with
|
||||
actual literals that follow). */
|
||||
|
||||
if (firstcuflags < 0)
|
||||
if (firstcuflags >= REQ_NONE)
|
||||
firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
|
||||
|
||||
/* Save the data for a first code unit. The existence of one means the
|
||||
minimum length must be at least 1. */
|
||||
|
||||
if (firstcuflags >= 0)
|
||||
if (firstcuflags < REQ_NONE)
|
||||
{
|
||||
re->first_codeunit = firstcu;
|
||||
re->flags |= PCRE2_FIRSTSET;
|
||||
|
@ -10422,16 +10516,16 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
different character and not a non-starting code unit of the first character,
|
||||
because the minimum length count is in characters, not code units. */
|
||||
|
||||
if (reqcuflags >= 0)
|
||||
if (reqcuflags < REQ_NONE)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
|
||||
firstcuflags < 0 || /* First not set */
|
||||
firstcuflags >= REQ_NONE || /* First not set */
|
||||
(firstcu & 0xf800) != 0xd800 || /* First not surrogate */
|
||||
(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
|
||||
firstcuflags < 0 || /* First not set */
|
||||
firstcuflags >= REQ_NONE || /* First not set */
|
||||
(firstcu & 0x80) == 0 || /* First is ASCII */
|
||||
(reqcu & 0x80) == 0) /* Req is ASCII */
|
||||
#endif
|
||||
|
@ -10528,4 +10622,10 @@ re = NULL;
|
|||
goto EXIT;
|
||||
}
|
||||
|
||||
/* These #undefs are here to enable unity builds with CMake. */
|
||||
|
||||
#undef NLBLOCK /* Block containing newline information */
|
||||
#undef PSSTART /* Field containing processed string start */
|
||||
#undef PSEND /* Field containing processed string end */
|
||||
|
||||
/* End of pcre2_compile.c */
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -65,9 +65,8 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
|
||||
#define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS
|
||||
|
||||
/* States for range and POSIX processing */
|
||||
/* States for POSIX processing */
|
||||
|
||||
enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED };
|
||||
enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,
|
||||
POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -350,7 +350,7 @@ Returns: the return from the callout
|
|||
*/
|
||||
|
||||
static int
|
||||
do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
|
||||
do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
|
||||
PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
|
||||
PCRE2_SIZE *lengthptr)
|
||||
{
|
||||
|
@ -1193,6 +1193,11 @@ for (;;)
|
|||
OK = prop->script == code[2];
|
||||
break;
|
||||
|
||||
case PT_SCX:
|
||||
OK = (prop->script == code[2] ||
|
||||
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
|
@ -1240,6 +1245,15 @@ for (;;)
|
|||
c >= 0xe000;
|
||||
break;
|
||||
|
||||
case PT_BIDICL:
|
||||
OK = UCD_BIDICLASS(c) == code[2];
|
||||
break;
|
||||
|
||||
case PT_BOOL:
|
||||
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS_PROP(prop), code[2]) != 0;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
|
@ -1451,6 +1465,11 @@ for (;;)
|
|||
OK = prop->script == code[3];
|
||||
break;
|
||||
|
||||
case PT_SCX:
|
||||
OK = (prop->script == code[3] ||
|
||||
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
|
@ -1498,6 +1517,15 @@ for (;;)
|
|||
c >= 0xe000;
|
||||
break;
|
||||
|
||||
case PT_BIDICL:
|
||||
OK = UCD_BIDICLASS(c) == code[3];
|
||||
break;
|
||||
|
||||
case PT_BOOL:
|
||||
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS_PROP(prop), code[3]) != 0;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
|
@ -1692,6 +1720,11 @@ for (;;)
|
|||
OK = prop->script == code[3];
|
||||
break;
|
||||
|
||||
case PT_SCX:
|
||||
OK = (prop->script == code[3] ||
|
||||
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
|
@ -1739,6 +1772,15 @@ for (;;)
|
|||
c >= 0xe000;
|
||||
break;
|
||||
|
||||
case PT_BIDICL:
|
||||
OK = UCD_BIDICLASS(c) == code[3];
|
||||
break;
|
||||
|
||||
case PT_BOOL:
|
||||
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS_PROP(prop), code[3]) != 0;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
|
@ -1958,6 +2000,12 @@ for (;;)
|
|||
OK = prop->script == code[1 + IMM2_SIZE + 2];
|
||||
break;
|
||||
|
||||
case PT_SCX:
|
||||
OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
|
||||
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
|
||||
code[1 + IMM2_SIZE + 2]) != 0);
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
|
@ -2005,6 +2053,15 @@ for (;;)
|
|||
c >= 0xe000;
|
||||
break;
|
||||
|
||||
case PT_BIDICL:
|
||||
OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
|
||||
break;
|
||||
|
||||
case PT_BOOL:
|
||||
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
|
@ -2742,7 +2799,7 @@ for (;;)
|
|||
|| code[LINK_SIZE + 1] == OP_CALLOUT_STR)
|
||||
{
|
||||
PCRE2_SIZE callout_length;
|
||||
rrc = do_callout(code, offsets, current_subject, ptr, mb,
|
||||
rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
|
||||
1 + LINK_SIZE, &callout_length);
|
||||
if (rrc < 0) return rrc; /* Abandon */
|
||||
if (rrc > 0) break; /* Fail this thread */
|
||||
|
@ -3139,7 +3196,7 @@ for (;;)
|
|||
case OP_CALLOUT_STR:
|
||||
{
|
||||
PCRE2_SIZE callout_length;
|
||||
rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
|
||||
rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
|
||||
&callout_length);
|
||||
if (rrc < 0) return rrc; /* Abandon */
|
||||
if (rrc == 0)
|
||||
|
@ -3285,8 +3342,15 @@ rws->next = NULL;
|
|||
rws->size = RWS_BASE_SIZE;
|
||||
rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
|
||||
|
||||
/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
|
||||
subject string. */
|
||||
/* Recognize NULL, length 0 as an empty string. */
|
||||
|
||||
if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
|
||||
|
||||
/* Plausibility checks */
|
||||
|
||||
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
|
||||
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
|
||||
return PCRE2_ERROR_NULL;
|
||||
|
||||
if (length == PCRE2_ZERO_TERMINATED)
|
||||
{
|
||||
|
@ -3294,11 +3358,6 @@ if (length == PCRE2_ZERO_TERMINATED)
|
|||
was_zero_terminated = 1;
|
||||
}
|
||||
|
||||
/* Plausibility checks */
|
||||
|
||||
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
|
||||
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
|
||||
return PCRE2_ERROR_NULL;
|
||||
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
|
||||
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
|
||||
|
||||
|
@ -3998,4 +4057,10 @@ while (rws->next != NULL)
|
|||
return rc;
|
||||
}
|
||||
|
||||
/* These #undefs are here to enable unity builds with CMake. */
|
||||
|
||||
#undef NLBLOCK /* Block containing newline information */
|
||||
#undef PSSTART /* Field containing processed string start */
|
||||
#undef PSEND /* Field containing processed string end */
|
||||
|
||||
/* End of pcre2_dfa_match.c */
|
||||
|
|
|
@ -119,7 +119,7 @@ static const unsigned char compile_error_texts[] =
|
|||
/* 45 */
|
||||
"this version of PCRE2 does not have support for \\P, \\p, or \\X\0"
|
||||
"malformed \\P or \\p sequence\0"
|
||||
"unknown property name after \\P or \\p\0"
|
||||
"unknown property after \\P or \\p\0"
|
||||
"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0"
|
||||
"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
|
||||
/* 50 */
|
||||
|
@ -253,7 +253,7 @@ static const unsigned char match_error_texts[] =
|
|||
"unknown substring\0"
|
||||
/* 50 */
|
||||
"non-unique substring name\0"
|
||||
"NULL argument passed\0"
|
||||
"NULL argument passed with non-zero length\0"
|
||||
"nested recursion at the same subject position\0"
|
||||
"matching depth limit exceeded\0"
|
||||
"requested value is not available\0"
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2019 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -105,7 +105,7 @@ while (eptr < end_subject)
|
|||
/* Not breaking between Regional Indicators is allowed only if there
|
||||
are an even number of preceding RIs. */
|
||||
|
||||
if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator)
|
||||
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
|
||||
{
|
||||
int ricount = 0;
|
||||
PCRE2_SPTR bptr = eptr - 1;
|
||||
|
@ -123,7 +123,7 @@ while (eptr < end_subject)
|
|||
}
|
||||
else
|
||||
c = *bptr;
|
||||
if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break;
|
||||
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
|
||||
ricount++;
|
||||
}
|
||||
if ((ricount & 1) != 0) break; /* Grapheme break required */
|
||||
|
|
|
@ -151,6 +151,10 @@ for (i = 0; i < 2; i++)
|
|||
int j;
|
||||
uint32_t save_match_options = match_options;
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
pcre2_jit_compile(code, PCRE2_JIT_COMPLETE);
|
||||
#endif
|
||||
|
||||
/* Create match data and context blocks only when we first need them. Set
|
||||
low match and depth limits to avoid wasting too much searching large
|
||||
pattern trees. Almost all matches are going to fail. */
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -220,18 +220,17 @@ not rely on this. */
|
|||
|
||||
#define COMPILE_ERROR_BASE 100
|
||||
|
||||
/* The initial frames vector for remembering backtracking points in
|
||||
pcre2_match() is allocated on the system stack, of this size (bytes). The size
|
||||
must be a multiple of sizeof(PCRE2_SPTR) in all environments, so making it a
|
||||
multiple of 8 is best. Typical frame sizes are a few hundred bytes (it depends
|
||||
on the number of capturing parentheses) so 20KiB handles quite a few frames. A
|
||||
larger vector on the heap is obtained for patterns that need more frames. The
|
||||
maximum size of this can be limited. */
|
||||
/* The initial frames vector for remembering pcre2_match() backtracking points
|
||||
is allocated on the heap, of this size (bytes) or ten times the frame size if
|
||||
larger, unless the heap limit is smaller. Typical frame sizes are a few hundred
|
||||
bytes (it depends on the number of capturing parentheses) so 20KiB handles
|
||||
quite a few frames. A larger vector on the heap is obtained for matches that
|
||||
need more frames, subject to the heap limit. */
|
||||
|
||||
#define START_FRAMES_SIZE 20480
|
||||
|
||||
/* Similarly, for DFA matching, an initial internal workspace vector is
|
||||
allocated on the stack. */
|
||||
/* For DFA matching, an initial internal workspace vector is allocated on the
|
||||
stack. The heap is used only if this turns out to be too small. */
|
||||
|
||||
#define DFA_START_RWS_SIZE 30720
|
||||
|
||||
|
@ -954,6 +953,13 @@ a positive value. */
|
|||
#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION="
|
||||
#define STRING_MARK "MARK"
|
||||
|
||||
#define STRING_bc "bc"
|
||||
#define STRING_bidiclass "bidiclass"
|
||||
#define STRING_sc "sc"
|
||||
#define STRING_script "script"
|
||||
#define STRING_scriptextensions "scriptextensions"
|
||||
#define STRING_scx "scx"
|
||||
|
||||
#else /* SUPPORT_UNICODE */
|
||||
|
||||
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
|
||||
|
@ -1248,26 +1254,39 @@ only. */
|
|||
#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
|
||||
#define STRING_MARK STR_M STR_A STR_R STR_K
|
||||
|
||||
#define STRING_bc STR_b STR_c
|
||||
#define STRING_bidiclass STR_b STR_i STR_d STR_i STR_c STR_l STR_a STR_s STR_s
|
||||
#define STRING_sc STR_s STR_c
|
||||
#define STRING_script STR_s STR_c STR_r STR_i STR_p STR_t
|
||||
#define STRING_scriptextensions STR_s STR_c STR_r STR_i STR_p STR_t STR_e STR_x STR_t STR_e STR_n STR_s STR_i STR_o STR_n STR_s
|
||||
#define STRING_scx STR_s STR_c STR_x
|
||||
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* -------------------- End of character and string names -------------------*/
|
||||
|
||||
/* -------------------- Definitions for compiled patterns -------------------*/
|
||||
|
||||
/* Codes for different types of Unicode property */
|
||||
/* Codes for different types of Unicode property. If these definitions are
|
||||
changed, the autopossessifying table in pcre2_auto_possess.c must be updated to
|
||||
match. */
|
||||
|
||||
#define PT_ANY 0 /* Any property - matches all chars */
|
||||
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
|
||||
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
|
||||
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
|
||||
#define PT_SC 4 /* Script (e.g. Han) */
|
||||
#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
|
||||
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
|
||||
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
|
||||
#define PT_WORD 8 /* Word - L plus N plus underscore */
|
||||
#define PT_CLIST 9 /* Pseudo-property: match character list */
|
||||
#define PT_UCNC 10 /* Universal Character nameable character */
|
||||
#define PT_TABSIZE 11 /* Size of square table for autopossessify tests */
|
||||
#define PT_SC 4 /* Script only (e.g. Han) */
|
||||
#define PT_SCX 5 /* Script extensions (includes SC) */
|
||||
#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */
|
||||
#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */
|
||||
#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */
|
||||
#define PT_WORD 9 /* Word - L plus N plus underscore */
|
||||
#define PT_CLIST 10 /* Pseudo-property: match character list */
|
||||
#define PT_UCNC 11 /* Universal Character nameable character */
|
||||
#define PT_BIDICL 12 /* Specified bidi class */
|
||||
#define PT_BOOL 13 /* Boolean property */
|
||||
#define PT_TABSIZE 14 /* Size of square table for autopossessify tests */
|
||||
|
||||
/* The following special properties are used only in XCLASS items, when POSIX
|
||||
classes are specified and PCRE2_UCP is set - in other words, for Unicode
|
||||
|
@ -1275,9 +1294,14 @@ handling of these classes. They are not available via the \p or \P escapes like
|
|||
those in the above list, and so they do not take part in the autopossessifying
|
||||
table. */
|
||||
|
||||
#define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper */
|
||||
#define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */
|
||||
#define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */
|
||||
#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */
|
||||
#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */
|
||||
#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */
|
||||
|
||||
/* This value is used when parsing \p and \P escapes to indicate that neither
|
||||
\p{script:...} nor \p{scx:...} has been encountered. */
|
||||
|
||||
#define PT_NOTSCRIPT 255
|
||||
|
||||
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
|
||||
contain characters with values greater than 255. */
|
||||
|
@ -1797,8 +1821,8 @@ typedef struct {
|
|||
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
|
||||
uint8_t caseset; /* offset to multichar other cases or zero */
|
||||
int32_t other_case; /* offset to other case, or zero if none */
|
||||
int16_t scriptx; /* script extension value */
|
||||
int16_t dummy; /* spare - to round to multiple of 4 bytes */
|
||||
uint16_t scriptx_bidiclass; /* script extension (11 bit) and bidi class (5 bit) values */
|
||||
uint16_t bprops; /* binary properties offset */
|
||||
} ucd_record;
|
||||
|
||||
/* UCD access macros */
|
||||
|
@ -1815,13 +1839,30 @@ typedef struct {
|
|||
#define GET_UCD(ch) REAL_GET_UCD(ch)
|
||||
#endif
|
||||
|
||||
#define UCD_SCRIPTX_MASK 0x3ff
|
||||
#define UCD_BIDICLASS_SHIFT 11
|
||||
#define UCD_BPROPS_MASK 0xfff
|
||||
|
||||
#define UCD_SCRIPTX_PROP(prop) ((prop)->scriptx_bidiclass & UCD_SCRIPTX_MASK)
|
||||
#define UCD_BIDICLASS_PROP(prop) ((prop)->scriptx_bidiclass >> UCD_BIDICLASS_SHIFT)
|
||||
#define UCD_BPROPS_PROP(prop) ((prop)->bprops & UCD_BPROPS_MASK)
|
||||
|
||||
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
|
||||
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
|
||||
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
|
||||
#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop
|
||||
#define UCD_CASESET(ch) GET_UCD(ch)->caseset
|
||||
#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
|
||||
#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx
|
||||
#define UCD_SCRIPTX(ch) UCD_SCRIPTX_PROP(GET_UCD(ch))
|
||||
#define UCD_BPROPS(ch) UCD_BPROPS_PROP(GET_UCD(ch))
|
||||
#define UCD_BIDICLASS(ch) UCD_BIDICLASS_PROP(GET_UCD(ch))
|
||||
|
||||
/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words
|
||||
that form a bitmap representing a list of scripts or boolean properties. These
|
||||
macros test or set a bit in the map by number. */
|
||||
|
||||
#define MAPBIT(map,n) ((map)[(n)/32]&(1u<<((n)%32)))
|
||||
#define MAPSET(map,n) ((map)[(n)/32]|=(1u<<((n)%32)))
|
||||
|
||||
/* Header for serialized pcre2 codes. */
|
||||
|
||||
|
@ -1878,6 +1919,7 @@ extern const uint8_t PRIV(utf8_table4)[];
|
|||
#endif
|
||||
#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_)
|
||||
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
|
||||
#define _pcre2_ucd_boolprop_sets PCRE2_SUFFIX(_pcre2_ucd_boolprop_sets_)
|
||||
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
|
||||
#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
|
||||
#define _pcre2_ucd_script_sets PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
|
||||
|
@ -1901,9 +1943,10 @@ extern const pcre2_match_context PRIV(default_match_context);
|
|||
extern const uint8_t PRIV(default_tables)[];
|
||||
extern const uint32_t PRIV(hspace_list)[];
|
||||
extern const uint32_t PRIV(vspace_list)[];
|
||||
extern const uint32_t PRIV(ucd_boolprop_sets)[];
|
||||
extern const uint32_t PRIV(ucd_caseless_sets)[];
|
||||
extern const uint32_t PRIV(ucd_digit_sets)[];
|
||||
extern const uint8_t PRIV(ucd_script_sets)[];
|
||||
extern const uint32_t PRIV(ucd_script_sets)[];
|
||||
extern const ucd_record PRIV(ucd_records)[];
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
extern const ucd_record PRIV(dummy_ucd_record)[];
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -519,7 +519,7 @@ it is. This is called only in UTF-32 mode - we don't put a test within the
|
|||
macro because almost all calls are already within a block of UTF-32 only
|
||||
code.
|
||||
|
||||
These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */
|
||||
These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */
|
||||
|
||||
#define BACKCHAR(eptr) do { } while (0)
|
||||
|
||||
|
@ -649,11 +649,15 @@ the size varies from call to call. As the maximum number of capturing
|
|||
subpatterns is 65535 we must allow for 65536 strings to include the overall
|
||||
match. (See also the heapframe structure below.) */
|
||||
|
||||
struct heapframe; /* Forward reference */
|
||||
|
||||
typedef struct pcre2_real_match_data {
|
||||
pcre2_memctl memctl;
|
||||
pcre2_memctl memctl; /* Memory control fields */
|
||||
const pcre2_real_code *code; /* The pattern used for the match */
|
||||
PCRE2_SPTR subject; /* The subject that was matched */
|
||||
PCRE2_SPTR mark; /* Pointer to last mark */
|
||||
struct heapframe *heapframes; /* Backtracking frames heap memory */
|
||||
PCRE2_SIZE heapframes_size; /* Malloc-ed size */
|
||||
PCRE2_SIZE leftchar; /* Offset to leftmost code unit */
|
||||
PCRE2_SIZE rightchar; /* Offset to rightmost code unit */
|
||||
PCRE2_SIZE startchar; /* Offset to starting code unit */
|
||||
|
@ -747,8 +751,8 @@ typedef struct compile_block {
|
|||
uint32_t class_range_start; /* Overall class range start */
|
||||
uint32_t class_range_end; /* Overall class range end */
|
||||
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
|
||||
uint32_t req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
int max_lookbehind; /* Maximum lookbehind (characters) */
|
||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
||||
BOOL had_recurse; /* Had a recursion or subroutine call */
|
||||
|
@ -764,7 +768,7 @@ typedef struct pcre2_real_jit_stack {
|
|||
} pcre2_real_jit_stack;
|
||||
|
||||
/* Structure for items in a linked list that represents an explicit recursive
|
||||
call within the pattern when running pcre_dfa_match(). */
|
||||
call within the pattern when running pcre2_dfa_match(). */
|
||||
|
||||
typedef struct dfa_recursion_info {
|
||||
struct dfa_recursion_info *prevrec;
|
||||
|
@ -838,15 +842,22 @@ multiple of PCRE2_SIZE. See various comments above. */
|
|||
typedef char check_heapframe_size[
|
||||
((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];
|
||||
|
||||
/* Structure for computing the alignment of heapframe. */
|
||||
|
||||
typedef struct heapframe_align {
|
||||
char unalign; /* Completely unalign the current offset */
|
||||
heapframe frame; /* Offset is its alignment */
|
||||
} heapframe_align;
|
||||
|
||||
/* This define is the minimum alignment required for a heapframe, in bytes. */
|
||||
|
||||
#define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame)
|
||||
|
||||
/* Structure for passing "static" information around between the functions
|
||||
doing traditional NFA matching (pcre2_match() and friends). */
|
||||
|
||||
typedef struct match_block {
|
||||
pcre2_memctl memctl; /* For general use */
|
||||
PCRE2_SIZE frame_vector_size; /* Size of a backtracking frame */
|
||||
heapframe *match_frames; /* Points to vector of frames */
|
||||
heapframe *match_frames_top; /* Points after the end of the vector */
|
||||
heapframe *stack_frames; /* The original vector on the stack */
|
||||
PCRE2_SIZE heap_limit; /* As it says */
|
||||
uint32_t match_limit; /* As it says */
|
||||
uint32_t match_limit_depth; /* As it says */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -120,7 +120,7 @@ else if ((options & PCRE2_PARTIAL_SOFT) != 0)
|
|||
if (functions == NULL || functions->executable_funcs[index] == NULL)
|
||||
return PCRE2_ERROR_JIT_BADOPTION;
|
||||
|
||||
/* Sanity checks should be handled by pcre_exec. */
|
||||
/* Sanity checks should be handled by pcre2_match. */
|
||||
arguments.str = subject + start_offset;
|
||||
arguments.begin = subject;
|
||||
arguments.end = subject + length;
|
||||
|
|
|
@ -135,7 +135,7 @@ return NULL;
|
|||
|
||||
pcre2_jit_stack *jit_stack;
|
||||
|
||||
if (startsize < 1 || maxsize < 1)
|
||||
if (startsize == 0 || maxsize == 0 || maxsize > SIZE_MAX - STACK_GROWTH_RATE)
|
||||
return NULL;
|
||||
if (startsize > maxsize)
|
||||
startsize = maxsize;
|
||||
|
|
|
@ -339,7 +339,7 @@ if (common->mode != PCRE2_JIT_COMPLETE)
|
|||
{
|
||||
JUMPHERE(partial_quit[0]);
|
||||
JUMPHERE(partial_quit[1]);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
|
||||
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
|
||||
CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
|
||||
}
|
||||
else
|
||||
|
@ -537,7 +537,7 @@ if (common->match_end_ptr != 0)
|
|||
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
|
||||
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
|
||||
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
|
||||
OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
|
||||
CMOV(SLJIT_LESS, STR_END, TMP1, 0);
|
||||
}
|
||||
|
||||
|
@ -883,14 +883,14 @@ if (char1 == char2)
|
|||
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (common->utf && offset > 0)
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
|
||||
else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
|
||||
#else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
|
||||
#endif
|
||||
}
|
||||
else
|
||||
|
@ -904,14 +904,14 @@ else
|
|||
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (common->utf && offset > 0)
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
|
||||
else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
|
||||
#else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
|
||||
#endif
|
||||
}
|
||||
else
|
||||
|
@ -922,14 +922,14 @@ else
|
|||
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (common->utf && offset > 0)
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
|
||||
else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
|
||||
#else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -1067,7 +1067,7 @@ else
|
|||
OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
|
||||
OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
|
||||
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, STR_END, 0, SLJIT_R0, 0);
|
||||
OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
|
||||
CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0);
|
||||
}
|
||||
|
||||
|
@ -1084,31 +1084,31 @@ if (diff == 1) {
|
|||
if (char1a == char1b && char2a == char2b) {
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (common->utf)
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
|
||||
else
|
||||
#endif
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
|
||||
} else {
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (common->utf)
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
|
||||
else
|
||||
#endif
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
|
||||
}
|
||||
} else {
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
if (common->utf)
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
|
||||
else
|
||||
#endif
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
|
||||
SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
|
||||
SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
|
||||
}
|
||||
|
||||
/* Restore STR_PTR register. */
|
||||
|
@ -1418,7 +1418,7 @@ if (common->mode != PCRE2_JIT_COMPLETE)
|
|||
{
|
||||
JUMPHERE(partial_quit[0]);
|
||||
JUMPHERE(partial_quit[1]);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
|
||||
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
|
||||
CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
|
||||
}
|
||||
else
|
||||
|
@ -1673,7 +1673,7 @@ if (common->match_end_ptr != 0)
|
|||
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
|
||||
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
|
||||
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
|
||||
OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
|
||||
CMOV(SLJIT_LESS, STR_END, TMP1, 0);
|
||||
}
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue