Compare commits
177 Commits
pcre2-10.3
...
amigaos
Author | SHA1 | Date |
---|---|---|
George Sokianos | 4a45482c9c | |
Philip Hazel | 8b133fa0ba | |
Philip Hazel | cc5e121c8e | |
Philip Hazel | 1343bdff8f | |
Philip Hazel | d90fb23878 | |
Ezekiel Warren | e47fc51584 | |
Zoltan Herczeg | b67d568201 | |
Zoltan Herczeg | 4851890ede | |
Amin Yahyaabadi | 3e52db5209 | |
Philip Hazel | 4804b00e8f | |
Philip Hazel | 7549fdca74 | |
Philip Hazel | 5271b533c4 | |
larinsv | 45af1203bd | |
Rémi Verschelde | 187b7ba050 | |
William A Rowe Jr | 06f34ba374 | |
GregThain | a334ea2a34 | |
Carlo Marcelo Arenas Belón | 15a82c3efd | |
Philip Hazel | 51a5fcdc1f | |
Philip Hazel | 104fe2fead | |
Philip Hazel | f65df06305 | |
pkeir | a13d7d4340 | |
Lucas Trzesniewski | c630e868ca | |
Joe Zhang | 77ce1ff528 | |
Philip Hazel | ff5402a378 | |
Philip Hazel | b52d055d1b | |
Carlo Marcelo Arenas Belón | a4ac97fea8 | |
Philip Hazel | fedf4d9d40 | |
Philip Hazel | 8ebf9efe7b | |
Carlo Marcelo Arenas Belón | 4edcf6ada5 | |
Philip Hazel | d0c7544e78 | |
Carlo Marcelo Arenas Belón | f28e82602d | |
Philip Hazel | 1bb2b97b29 | |
Lucas Trzesniewski | 3fec24a26f | |
Philip Hazel | 66b3cb34df | |
Philip Hazel | 29a43aa11d | |
Philip Hazel | 3103b8f20a | |
Philip Hazel | 13be26a5c2 | |
pagabuc | ba6a5f16d2 | |
Zoltan Herczeg | d07c967b3a | |
Carlo Marcelo Arenas Belón | 4279abbd7d | |
Philip Hazel | 8ff3ab27d5 | |
Zoltan Herczeg | e612e06b5d | |
Philip Hazel | 64c9baaaa4 | |
Carlo Marcelo Arenas Belón | 9c8abddc52 | |
Carlo Marcelo Arenas Belón | f11c26842d | |
Zoltan Herczeg | 4ca0530b9b | |
Zoltan Herczeg | 03654e751e | |
Zoltan Herczeg | d4fa336fbc | |
Zoltan Herczeg | 50a51cb7e6 | |
Philip Hazel | f7a7341726 | |
Philip Hazel | eef5740ff9 | |
Zoltan Herczeg | dea56d2df9 | |
Adam | 111cd470b5 | |
Philip Hazel | fdd9479108 | |
Philip Hazel | 419e3c68a3 | |
Zoltan Herczeg | e21345de97 | |
Philip Hazel | e85a81ebac | |
Philip Hazel | 504ff06fff | |
Philip Hazel | 360a84e80b | |
Zoltan Herczeg | 061e57695a | |
Philip Hazel | 7f7d3e8521 | |
Philip Hazel | bf35c0518c | |
Zoltan Herczeg | 68fbc1982e | |
Philip Hazel | 06d3a66065 | |
Philip Hazel | 87571b5af3 | |
Philip Hazel | 838cdac4dc | |
Philip Hazel | 628a804102 | |
Philip Hazel | ec091e2e44 | |
Philip Hazel | 636569a957 | |
Philip Hazel | 81d3729c66 | |
Zoltan Herczeg | f90542a209 | |
Carlo Marcelo Arenas Belón | 14dbc6e6ec | |
Philip Hazel | 80205ee2a0 | |
Jessica Clarke | 04ecb267c0 | |
Jessica Clarke | 534b4760e3 | |
Philip Hazel | 31fb2e58a1 | |
Zoltan Herczeg | 435140a0ac | |
Philip Hazel | c24047f15d | |
Zoltan Herczeg | e7457003cd | |
Philip Hazel | d888d36013 | |
Zoltan Herczeg | 6614b281bc | |
Zoltan Herczeg | afa4756d19 | |
Philip Hazel | 7713f33e46 | |
Michael Kaufmann | af2637ee5e | |
Philip Hazel | 98e7d70bc6 | |
Philip Hazel | 321b559ed4 | |
Philip Hazel | 16c8a84cce | |
Philip Hazel | 4514ddd2a2 | |
Philip Hazel | 944f0e10a1 | |
Philip Hazel | b29732063b | |
Philip Hazel | 92d7cf1dd0 | |
Philip Hazel | 1d432ee3cf | |
Philip Hazel | 194a15315a | |
Philip Hazel | 1c41a5b815 | |
Zoltan Herczeg | 4243515033 | |
Philip Hazel | 49b29f837d | |
Philip Hazel | 30abd0ac8d | |
Philip Hazel | 0246c6bf64 | |
Philip Hazel | 823d4ac956 | |
Philip Hazel | ba3d0edcbd | |
Philip Hazel | 4ef0c51d2b | |
Philip Hazel | 7ab2769728 | |
Philip Hazel | 2a294ddadb | |
Philip Hazel | cb854a912e | |
Philip Hazel | 16dccbcb13 | |
Carlo Marcelo Arenas Belón | ae4e6261e5 | |
Carlo Marcelo Arenas Belón | d24a1c9d31 | |
Carlo Marcelo Arenas Belón | 055b7ce4a9 | |
Philip Hazel | 4a8f5d104c | |
Carlo Marcelo Arenas Belón | 587b94277b | |
Philip Hazel | c8d31f1605 | |
Carlo Marcelo Arenas Belón | adf76faace | |
Zoltan Herczeg | d144199dfb | |
Carlo Marcelo Arenas Belón | eb42305f07 | |
Philip Hazel | 46890604a4 | |
Carlo Marcelo Arenas Belón | acc520924c | |
Philip Hazel | bc70a183fc | |
Carlo Marcelo Arenas Belón | dae475092d | |
Philip Hazel | 1ed34b9cb1 | |
Philip Hazel | f19e84674e | |
Carlo Marcelo Arenas Belón | 7db8784296 | |
Philip Hazel | 072717a61f | |
Philip Hazel | 35fee4193b | |
Philip Hazel | 3469b13b8e | |
Philip Hazel | 29c37f9aa3 | |
Carlo Marcelo Arenas Belón | 128c50360c | |
Philip Hazel | bf2c8cc564 | |
Philip Hazel | 87f32b9b39 | |
Philip Hazel | 7ed39af7cc | |
Carlo Marcelo Arenas Belón | 3b973ebf4b | |
Carlo Marcelo Arenas Belón | f5e4e10042 | |
Carlo Marcelo Arenas Belón | d46f1863be | |
Philip Hazel | c99f0738c5 | |
Philip Hazel | 794470b51d | |
PhilipHazel | 179c5d212c | |
Lucas Trzesniewski | ec0755b829 | |
Philip Hazel | 8d9e91228c | |
PhilipHazel | e7af7efaa1 | |
Zoltan Herczeg | 51ec2c9893 | |
Philip Hazel | 0612ed77c2 | |
Philip Hazel | 507e4dcf6f | |
Zoltan Herczeg | dc5f966635 | |
Philip Hazel | 8f3e11a355 | |
Philip Hazel | e2fde18833 | |
Philip Hazel | 857ac92372 | |
Philip Hazel | 31a46200fa | |
Philip Hazel | edcc076bd8 | |
Philip Hazel | c232286c6b | |
Philip Hazel | 21c26698b3 | |
Philip Hazel | eea410b33a | |
Philip Hazel | d5a61ee891 | |
Philip Hazel | 6c2fe9da99 | |
Philip Hazel | 5ff1daffa0 | |
Philip Hazel | f4beac6c1a | |
Philip Hazel | e1cd61c292 | |
Philip Hazel | 6ee9921a89 | |
Philip Hazel | b8c60ce272 | |
Philip Hazel | b61aa572f6 | |
Philip Hazel | 25bb9de6fc | |
Philip Hazel | e74a9b6932 | |
PhilipHazel | 30036e670f | |
Philip Hazel | a8c4ef7f20 | |
Philip Hazel | c2fc6cfa0a | |
Philip Hazel | 587e46b372 | |
Philip Hazel | d8267c20fd | |
Philip Hazel | 15b692fd82 | |
Philip Hazel | 4ccef1697a | |
Philip Hazel | 5c0d38b3a8 | |
Philip Hazel | 23c16e6ced | |
Philip Hazel | 876ba431b0 | |
Philip Hazel | f64fbed2e1 | |
Philip.Hazel | 2410fbe386 | |
Philip.Hazel | d70da76dfb | |
Zoltán Herczeg | a5389db88d | |
Zoltán Herczeg | 3d80cf5a25 | |
Zoltán Herczeg | 900921f83e | |
Zoltán Herczeg | 1951243b5d |
|
@ -0,0 +1,3 @@
|
|||
common --experimental_enable_bzlmod
|
||||
build --incompatible_enable_cc_toolchain_resolution
|
||||
build --incompatible_strict_action_env
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
name: Build
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Linux
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
alpine:
|
||||
name: alpine
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autotools
|
||||
run: apk add --no-cache automake autoconf gcc libtool make musl-dev
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
windows:
|
||||
name: 32bit Windows
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Configure
|
||||
run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
cd build\Debug
|
||||
..\..\RunTest.bat
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master ]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: [ master ]
|
||||
schedule:
|
||||
- cron: '27 6 * * 4'
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'cpp', 'python' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v1
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v1
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
||||
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
|
||||
# and modify them (or add more) to build your code if your project
|
||||
# uses a compiled language
|
||||
|
||||
#- run: |
|
||||
# make bootstrap
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v1
|
|
@ -0,0 +1,55 @@
|
|||
name: Scorecards supply-chain security
|
||||
on:
|
||||
# Only the default branch is supported.
|
||||
branch_protection_rule:
|
||||
schedule:
|
||||
- cron: '23 17 * * 1'
|
||||
push:
|
||||
branches: [ master ]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecards analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
actions: read
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# Read-only PAT token. To create it,
|
||||
# follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
|
||||
repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
|
||||
# Publish the results to enable scorecard badges. For more details, see
|
||||
# https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories, `publish_results` will automatically be set to `false`,
|
||||
# regardless of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional).
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -1,47 +1,82 @@
|
|||
INSTALL
|
||||
Makefile.in
|
||||
aclocal.m4
|
||||
ar-lib
|
||||
compile
|
||||
config.guess
|
||||
config.sub
|
||||
configure
|
||||
depcomp
|
||||
install-sh
|
||||
ltmain.sh
|
||||
m4/
|
||||
missing
|
||||
test-driver
|
||||
# Public .gitignore file for PCRE2
|
||||
|
||||
Makefile
|
||||
config.log
|
||||
config.status
|
||||
libpcre2-*.pc
|
||||
libtool
|
||||
pcre2-config
|
||||
src/.deps
|
||||
src/config.h
|
||||
src/pcre2.h
|
||||
src/stamp-h1
|
||||
|
||||
.libs
|
||||
*.o
|
||||
*.lo
|
||||
*.a
|
||||
*.lo
|
||||
*.la
|
||||
src/.dirstamp
|
||||
src/pcre2_chartables.c
|
||||
*.pc
|
||||
*.o
|
||||
*~
|
||||
*.lha
|
||||
|
||||
pcre2grep
|
||||
pcre2test
|
||||
pcre2_jit_test
|
||||
__pycache__
|
||||
.deps
|
||||
.libs
|
||||
|
||||
INSTALL
|
||||
Makefile
|
||||
Makefile.in
|
||||
RunGrepTest.log
|
||||
RunGrepTest.trs
|
||||
RunTest.log
|
||||
RunTest.trs
|
||||
|
||||
aclocal.m4
|
||||
ar-lib
|
||||
compile
|
||||
config.guess
|
||||
config.log
|
||||
config.status
|
||||
config.sub
|
||||
configure
|
||||
depcomp
|
||||
install-sh
|
||||
libtool
|
||||
ltmain.sh
|
||||
missing
|
||||
pcre2-config
|
||||
pcre2_dftables
|
||||
pcre2_jit_test
|
||||
pcre2_jit_test.log
|
||||
pcre2_jit_test.trs
|
||||
pcre2demo
|
||||
pcre2fuzzcheck
|
||||
pcre2grep
|
||||
pcre2test
|
||||
test-driver
|
||||
test-suite.log
|
||||
test3input
|
||||
test3output
|
||||
testNinput
|
||||
testNinputgrep
|
||||
teststderr
|
||||
teststderrM
|
||||
teststderrgrep
|
||||
teststdout
|
||||
teststdoutM
|
||||
testtemp1
|
||||
testtemp1grep
|
||||
testtemp2
|
||||
testtemp2grep
|
||||
testtry
|
||||
testtrygrep
|
||||
|
||||
m4/libtool.m4
|
||||
m4/ltoptions.m4
|
||||
m4/ltsugar.m4
|
||||
m4/ltversion.m4
|
||||
m4/lt~obsolete.m4
|
||||
|
||||
maint/ucptest
|
||||
maint/utf8
|
||||
|
||||
src/.deps
|
||||
src/.dirstamp
|
||||
src/config.h
|
||||
src/pcre2.h
|
||||
src/pcre2_chartables.c
|
||||
src/stamp-h1
|
||||
|
||||
/bazel-*
|
||||
|
||||
# End
|
||||
|
||||
*~
|
||||
|
|
8
AUTHORS
8
AUTHORS
|
@ -5,10 +5,10 @@ Written by: Philip Hazel
|
|||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2021 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
|
@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2021 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2021 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
####
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
|
||||
load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
|
||||
|
||||
copy_file(
|
||||
name = "config_h_generic",
|
||||
src = "src/config.h.generic",
|
||||
out = "src/config.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_h_generic",
|
||||
src = "src/pcre2.h.generic",
|
||||
out = "src/pcre2.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_chartables_c",
|
||||
src = "src/pcre2_chartables.c.dist",
|
||||
out = "src/pcre2_chartables.c",
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "pcre2",
|
||||
srcs = [
|
||||
"src/pcre2_auto_possess.c",
|
||||
"src/pcre2_compile.c",
|
||||
"src/pcre2_config.c",
|
||||
"src/pcre2_context.c",
|
||||
"src/pcre2_convert.c",
|
||||
"src/pcre2_dfa_match.c",
|
||||
"src/pcre2_error.c",
|
||||
"src/pcre2_extuni.c",
|
||||
"src/pcre2_find_bracket.c",
|
||||
"src/pcre2_maketables.c",
|
||||
"src/pcre2_match.c",
|
||||
"src/pcre2_match_data.c",
|
||||
"src/pcre2_newline.c",
|
||||
"src/pcre2_ord2utf.c",
|
||||
"src/pcre2_pattern_info.c",
|
||||
"src/pcre2_script_run.c",
|
||||
"src/pcre2_serialize.c",
|
||||
"src/pcre2_string_utils.c",
|
||||
"src/pcre2_study.c",
|
||||
"src/pcre2_substitute.c",
|
||||
"src/pcre2_substring.c",
|
||||
"src/pcre2_tables.c",
|
||||
"src/pcre2_ucd.c",
|
||||
"src/pcre2_ucptables.c",
|
||||
"src/pcre2_valid_utf.c",
|
||||
"src/pcre2_xclass.c",
|
||||
":pcre2_chartables_c",
|
||||
],
|
||||
hdrs = glob(["src/*.h"]) + [
|
||||
":config_h_generic",
|
||||
":pcre2_h_generic",
|
||||
],
|
||||
defines = [
|
||||
"HAVE_CONFIG_H",
|
||||
"PCRE2_CODE_UNIT_WIDTH=8",
|
||||
"PCRE2_STATIC",
|
||||
],
|
||||
includes = ["src"],
|
||||
strip_include_prefix = "src",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "pcre2demo",
|
||||
srcs = ["src/pcre2demo.c"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":pcre2"],
|
||||
)
|
417
CMakeLists.txt
417
CMakeLists.txt
|
@ -94,16 +94,27 @@
|
|||
# 2020-04-28 PH added function check for memfd_create based on Carlo's patch
|
||||
# 2020-05-25 PH added a check for Intel CET
|
||||
# 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel
|
||||
# 2021-06-29 JWSB added the option to build static library with PIC.
|
||||
# 2021-07-05 JWSB modified so that both the static and shared library can be
|
||||
# built in one go.
|
||||
# 2021-08-28 PH increased minimum version
|
||||
# 2021-08-28 PH added test for realpath()
|
||||
|
||||
PROJECT(PCRE2 C)
|
||||
|
||||
# Increased minimum to 2.8.5 to support GNUInstallDirs.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.5)
|
||||
# Increased minimum to 3.1 to support imported targets.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.1)
|
||||
|
||||
# Set policy CMP0026 to avoid warnings for the use of LOCATION in
|
||||
# GET_TARGET_PROPERTY. This should no longer be required.
|
||||
# CMAKE_POLICY(SET CMP0026 OLD)
|
||||
|
||||
# With a recent cmake, you can provide a rootdir to look for non
|
||||
# standard installed library dependencies, but to do so, the policy
|
||||
# needs to be set to new (by uncommenting the following)
|
||||
# CMAKE_POLICY(SET CMP0074 NEW)
|
||||
|
||||
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
|
||||
# on the command line.
|
||||
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
@ -128,8 +139,6 @@ INCLUDE(CheckTypeSize)
|
|||
INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR
|
||||
|
||||
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
|
||||
CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H)
|
||||
CHECK_INCLUDE_FILE(inttypes.h HAVE_INTTYPES_H)
|
||||
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
|
||||
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
|
||||
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
|
||||
|
@ -141,6 +150,13 @@ CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE)
|
|||
CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV)
|
||||
CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
|
||||
HAVE_REALPATH
|
||||
)
|
||||
|
||||
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
|
@ -172,8 +188,9 @@ ENDIF(INTEL_CET_ENABLED)
|
|||
# Note: CMakeSetup displays these in alphabetical order, regardless of
|
||||
# the order we use here.
|
||||
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL
|
||||
"Build shared libraries instead of static ones.")
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries.")
|
||||
|
||||
OPTION(BUILD_STATIC_LIBS "Build static libraries." ON)
|
||||
|
||||
OPTION(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON)
|
||||
|
||||
|
@ -181,6 +198,8 @@ OPTION(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF)
|
|||
|
||||
OPTION(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF)
|
||||
|
||||
OPTION(PCRE2_STATIC_PIC "Build the static library with the option position independent code enabled." OFF)
|
||||
|
||||
OPTION(PCRE2_DEBUG "Include debugging code" OFF)
|
||||
|
||||
OPTION(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF)
|
||||
|
@ -292,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
|
|||
IF(EDITLINE_FOUND)
|
||||
OPTION (PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
IF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ELSE(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
|
||||
" or set Editline_ROOT to a full libedit installed tree, as needed\n"
|
||||
" Might need to enable policy CMP0074 in CMakeLists.txt"
|
||||
)
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
|
||||
# readline lib
|
||||
IF(READLINE_FOUND)
|
||||
|
@ -306,9 +335,9 @@ ENDIF(PCRE2_SUPPORT_LIBREADLINE)
|
|||
|
||||
# Prepare build configuration
|
||||
|
||||
IF(NOT BUILD_SHARED_LIBS)
|
||||
SET(PCRE2_STATIC 1)
|
||||
ENDIF(NOT BUILD_SHARED_LIBS)
|
||||
IF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
|
||||
MESSAGE(FATAL_ERROR "At least one of BUILD_SHARED_LIBS or BUILD_STATIC_LIBS must be enabled.")
|
||||
ENDIF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
|
||||
|
||||
IF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32)
|
||||
MESSAGE(FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled")
|
||||
|
@ -332,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
|||
ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
|
||||
IF(READLINE_FOUND)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" Only one of the readline compatible libraries can be enabled.\n"
|
||||
" Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
|
||||
)
|
||||
ENDIF(READLINE_FOUND)
|
||||
ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
@ -348,7 +382,13 @@ IF(PCRE2_SUPPORT_UNICODE)
|
|||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT)
|
||||
SET(SUPPORT_JIT 1)
|
||||
SET(SUPPORT_JIT 1)
|
||||
IF(UNIX)
|
||||
FIND_PACKAGE(Threads REQUIRED)
|
||||
IF(CMAKE_USE_PTHREADS_INIT)
|
||||
SET(REQUIRE_PTHREAD 1)
|
||||
ENDIF(CMAKE_USE_PTHREADS_INIT)
|
||||
ENDIF(UNIX)
|
||||
ENDIF(PCRE2_SUPPORT_JIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
|
@ -597,39 +637,37 @@ SET(PCRE2_SOURCES
|
|||
SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
|
||||
SET(PCRE2POSIX_SOURCES src/pcre2posix.c)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2.rc pcre2.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2 coff info in mingw build)
|
||||
SET(PCRE2_SOURCES
|
||||
${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2posix.rc pcre2posix.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2posix coff info in mingw build)
|
||||
SET(PCRE2POSIX_SOURCES
|
||||
${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MINGW AND BUILD_SHARED_LIBS)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2.rc pcre2.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2 coff info in mingw build)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
||||
IF(MSVC AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES
|
||||
${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
SET(PCRE2POSIX_SOURCES
|
||||
${PCRE2POSIX_SOURCES} pcre2posix.rc)
|
||||
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MSVC AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2posix.rc pcre2posix.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2posix coff info in mingw build)
|
||||
SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC AND BUILD_SHARED_LIBS)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} pcre2posix.rc)
|
||||
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MSVC AND BUILD_SHARED_LIBS)
|
||||
|
||||
# Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681
|
||||
# This code was taken from the CMake wiki, not from WebM.
|
||||
|
@ -658,76 +696,213 @@ SET(targets)
|
|||
# 8-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_8)
|
||||
ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION})
|
||||
SET(targets ${targets} pcre2-8)
|
||||
ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
|
||||
SET(targets ${targets} pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET(targets ${targets} pcre2-posix-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8-static)
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8)
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static pcre2-posix-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION}
|
||||
OUTPUT_NAME pcre2-8)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
|
||||
OUTPUT_NAME pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
|
||||
SET(targets ${targets} pcre2-posix-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
# 16-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION})
|
||||
SET(targets ${targets} pcre2-16)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION}
|
||||
OUTPUT_NAME pcre2-16)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_16)
|
||||
|
||||
# 32-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION})
|
||||
SET(targets ${targets} pcre2-32)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION}
|
||||
OUTPUT_NAME pcre2-32)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_32)
|
||||
|
||||
# Executables
|
||||
|
@ -900,37 +1075,34 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"
|
|||
|
||||
INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)
|
||||
|
||||
# CMake config files.
|
||||
set(PCRE2_CONFIG_IN ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config.cmake.in)
|
||||
set(PCRE2_CONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config.cmake)
|
||||
configure_file(${PCRE2_CONFIG_IN} ${PCRE2_CONFIG_OUT} @ONLY)
|
||||
set(PCRE2_CONFIG_VERSION_IN ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config-version.cmake.in)
|
||||
set(PCRE2_CONFIG_VERSION_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config-version.cmake)
|
||||
configure_file(${PCRE2_CONFIG_VERSION_IN} ${PCRE2_CONFIG_VERSION_OUT} @ONLY)
|
||||
install(FILES ${PCRE2_CONFIG_OUT} ${PCRE2_CONFIG_VERSION_OUT} DESTINATION cmake)
|
||||
|
||||
FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
|
||||
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
|
||||
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
|
||||
|
||||
FOREACH(man ${man3})
|
||||
GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
|
||||
SET(man3_new ${man3} ${man})
|
||||
ENDFOREACH(man ${man3})
|
||||
SET(man3 ${man3_new})
|
||||
|
||||
INSTALL(FILES ${man1} DESTINATION man/man1)
|
||||
INSTALL(FILES ${man3} DESTINATION man/man3)
|
||||
INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)
|
||||
|
||||
IF(MSVC AND INSTALL_MSVC_PDB)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posix.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posixd.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS Debug)
|
||||
INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
|
||||
ENDIF(MSVC AND INSTALL_MSVC_PDB)
|
||||
|
||||
# Help, only for nice output
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ELSE(BUILD_SHARED_LIBS)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
SET(BUILD_STATIC_LIBS ON)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(PCRE2_HEAP_MATCH_RECURSE)
|
||||
MESSAGE(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.")
|
||||
|
@ -968,6 +1140,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}")
|
||||
MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")
|
||||
MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
|
||||
MESSAGE(STATUS " with PIC enabled ............. : ${PCRE2_STATIC_PIC}")
|
||||
MESSAGE(STATUS " Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
|
||||
MESSAGE(STATUS " Enable JIT in pcre2grep ......... : ${PCRE2GREP_SUPPORT_JIT}")
|
||||
MESSAGE(STATUS " Enable callouts in pcre2grep .... : ${PCRE2GREP_SUPPORT_CALLOUT}")
|
||||
|
@ -1002,10 +1175,10 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Use %zu and %td ..................: AUTO" )
|
||||
ENDIF(PCRE2_DISABLE_PERCENT_ZT)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MINGW AND BUILD_SHARED_LIBS)
|
||||
MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
|
||||
MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC)
|
||||
MESSAGE(STATUS " Install MSVC .pdb files ..........: ${INSTALL_MSVC_PDB}")
|
||||
|
|
285
ChangeLog
285
ChangeLog
|
@ -1,5 +1,282 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
Change Log for PCRE2 - see also the Git log
|
||||
-------------------------------------------
|
||||
|
||||
|
||||
Version 10.41 xx-xxx-2022
|
||||
-------------------------
|
||||
|
||||
1. Add fflush() before and after a fork callout in pcre2grep to get its output
|
||||
to be the same on all systems. (There were previously ordering differences in
|
||||
Alpine Linux).
|
||||
|
||||
2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
|
||||
|
||||
3. SSF scorecards grumbled about possible overflow in an expression in
|
||||
pcre2test. It never would have overflowed in practice, but some casts have been
|
||||
added and at the same time there's been some tidying of fprintf() calls that output
|
||||
size_t values.
|
||||
|
||||
4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
|
||||
|
||||
5. Minor code re-arrangement to remove gcc warning about realloc() in
|
||||
pcre2test.
|
||||
|
||||
6. Change a number of int variables that hold buffer and line lengths in
|
||||
pcre2grep to PCRE2_SIZE (aka size_t).
|
||||
|
||||
7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
|
||||
supported (even though that function would do nothing in that case) at the
|
||||
request of a user who doesn't even want to link with pcre2_jit_compile.o. Also
|
||||
tidied up an untidy #ifdef arrangement in pcre2test.
|
||||
|
||||
8. Fixed an issue in the backtracking optimization of character repeats in
|
||||
JIT. Furthermore optimize star repetitions, not just plus repetitions.
|
||||
|
||||
9. Removed the use of an initial backtracking frames vector on the system stack
|
||||
in pcre2_match() so that it now always uses the heap. (In a multi-thread
|
||||
environment with very small stacks there had been an issue.) This also is
|
||||
tidier for JIT matching, which didn't need that vector. The heap vector is now
|
||||
remembered in the match data block and re-used if that block itself is re-used.
|
||||
It is freed with the match data block.
|
||||
|
||||
10. Adjusted the find_limits code in pcre2test to work with change 9 above.
|
||||
|
||||
11. Added find_limits_noheap to pcre2test, because the heap limits are now
|
||||
different in different environments and so cannot be included in the standard
|
||||
tests.
|
||||
|
||||
12. Created a test for pcre2_match() heap processing that is not part of the
|
||||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
|
||||
handling of multiple passes.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
|
||||
in pcre2grep with buffered fseek(stdin).
|
||||
|
||||
3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
|
||||
not supported.
|
||||
|
||||
4. Revert an unintended change in JIT repeat detection.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
|
||||
|
||||
6. Merged documentation and comments patches from @carenas (GitHub #47).
|
||||
|
||||
7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
|
||||
from pcre2grep.
|
||||
|
||||
8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
|
||||
|
||||
9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
|
||||
substituting.
|
||||
|
||||
10. Add null_subject and null_replacement modifiers to pcre2test.
|
||||
|
||||
11. Add check for NULL subject to POSIX regexec() function.
|
||||
|
||||
12. Add check for NULL replacement to pcre2_substitute().
|
||||
|
||||
13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
|
||||
pcre2_substitute(), and the replacement argument of the latter, if the pointer
|
||||
is NULL and the length is zero, treat as an empty string. Apparently a number
|
||||
of applications treat NULL/0 in this way.
|
||||
|
||||
14. Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
15. Fix some minor issues raised by clang sanitize.
|
||||
|
||||
16. Very minor code speed up for maximizing character property matches.
|
||||
|
||||
17. A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
18. The Python scripts in the maint directory have been refactored. There are
|
||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||
duplicated are now held in a single common Python module.
|
||||
|
||||
19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
|
||||
hardware capabilities, which consist of both an integer address and additional
|
||||
metadata, meaning they are twice the size of the platform's size_t type, i.e.
|
||||
16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
|
||||
8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
|
||||
not 16. Whilst the first frame was always suitably aligned, this then
|
||||
misaligned the frame that follows, resulting in an alignment fault when storing
|
||||
a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
|
||||
Clarke PR#72.
|
||||
|
||||
20. Added -LP and -LS listing options to pcre2test.
|
||||
|
||||
21. A user discovered that the library names in CMakeLists.txt for MSVC
|
||||
debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
|
||||
|
||||
22. An item such as [Aa] is optimized into a caseless single character match.
|
||||
When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
|
||||
pattern, the optimizing "must be present for a match" character check was not
|
||||
being flagged as caseless, causing some matches that should have succeeded to
|
||||
fail.
|
||||
|
||||
23. Fixed a unicode property matching issue in JIT. The character was not
|
||||
fully read in caseless matching.
|
||||
|
||||
24. Fixed an issue affecting recursions in JIT caused by duplicated data
|
||||
transfers.
|
||||
|
||||
25. Merged patch from @carenas (GitHub #96) which fixes some problems with
|
||||
pcre2test and readline/readedit:
|
||||
|
||||
* Use the right header for libedit in FreeBSD with autoconf
|
||||
* Really allow libedit with cmake
|
||||
* Avoid using readline headers with libedit
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #28):
|
||||
|
||||
Visual Studio 2013 includes support for %zu and %td, so let newer
|
||||
versions of it avoid the fallback, and while at it, make sure that
|
||||
the first check is for DISABLE_PERCENT_ZT so it will be always
|
||||
honoured if chosen.
|
||||
|
||||
ptrdiff_t is signed, so use a signed type instead, and make sure
|
||||
that an appropriate width is chosen if pointers are 64bit wide and
|
||||
long is not (ex: Windows 64bit).
|
||||
|
||||
IMHO removing the cast (and therefore the possibility of truncation)
|
||||
makes the code cleaner and the fallback is likely portable enough
|
||||
with all 64-bit POSIX systems doing LP64 except for Windows.
|
||||
|
||||
3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0.
|
||||
|
||||
4. Merged patch from @carenas (GitHub #30):
|
||||
|
||||
* Cleanup: remove references to no longer used stdint.h
|
||||
|
||||
Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h
|
||||
(simplification) and remove the now unnecessary inclusion in
|
||||
pcre2_internal.h., 2018-11-14), stdint.h is no longer used.
|
||||
|
||||
Remove checks for it in autotools and CMake and document better the expected
|
||||
build failures for systems that might have stdint.h (C99) and not inttypes.h
|
||||
(from POSIX), like old Windows.
|
||||
|
||||
* Cleanup: remove detection for inttypes.h which is a hard dependency
|
||||
|
||||
CMake checks for standard headers are not meant to be used for hard
|
||||
dependencies, so they will prevent a possible fallback from working.
|
||||
|
||||
Alternatively, the header could be checked to make the configuration fail
|
||||
instead of breaking the build, but that was punted, as it was missing anyway
|
||||
from autotools.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #32):
|
||||
|
||||
* jit: allow building with ancient MSVC versions
|
||||
|
||||
Visual Studio older than 2013 fails to build with JIT enabled, because it is
|
||||
unable to parse non C89 compatible syntax, with mixed declarations and code.
|
||||
While most recent compilers wouldn't even report this as a warning since it
|
||||
is valid C99, it could be also made visible by adding to gcc/clang the
|
||||
-Wdeclaration-after-statement flag at build time.
|
||||
|
||||
Move the code below the affected definitions.
|
||||
|
||||
* pcre2grep: avoid mixing declarations with code
|
||||
|
||||
Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep,
|
||||
2021-08-28), code will fail to build in a strict C89 compiler.
|
||||
|
||||
Reformat slightly to make it C89 compatible again.
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix invalid single character repetition issues in JIT when the repetition
|
||||
is inside a capturing bracket and the bracket is preceded by character
|
||||
literals.
|
||||
|
||||
2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
|
||||
This extends the CMake build system to build both static and shared libraries
|
||||
in one go, builds the static library with PIC, and exposes PCRE2 libraries
|
||||
using the CMake config files. JWB provided these notes:
|
||||
|
||||
- Introduced CMake variable BUILD_STATIC_LIBS to build the static library.
|
||||
|
||||
- Make a small modification to config-cmake.h.in by removing the PCRE2_STATIC
|
||||
variable. Added PCRE2_STATIC variable to the static build using the
|
||||
target_compile_definitions() function.
|
||||
|
||||
- Extended the CMake config files.
|
||||
|
||||
- Introduced CMake variable PCRE2_USE_STATIC_LIBS to easily switch between
|
||||
the static and shared libraries.
|
||||
|
||||
- Added the PCRE2_STATIC variable to the target compile definitions for the
|
||||
import of the static library.
|
||||
|
||||
Building static and shared libraries using MSVC results in a name clash of
|
||||
the libraries. Both static and shared library builds create, for example, the
|
||||
file pcre2-8.lib. Therefore, I decided to change the static library names by
|
||||
adding "-static". For example, pcre2-8.lib has become pcre2-8-static.lib.
|
||||
[Comment by PH: this is MSVC-specific. It doesn't happen on Linux.]
|
||||
|
||||
3. Increased the minimum release number for CMake to 3.0.0 because older than
|
||||
2.8.12 is deprecated (it was set to 2.8.5) and causes warnings. Even 3.0.0 is
|
||||
quite old; it was released in 2014.
|
||||
|
||||
4. Implemented a modified version of Thomas Tempelmann's pcre2grep patch for
|
||||
detecting symlink loops. This is dependent on the availability of realpath(),
|
||||
which is now tested for in ./configure and CMakeLists.txt.
|
||||
|
||||
5. Implemented a modified version of Thomas Tempelmann's patch for faster
|
||||
case-independent "first code unit" searches for unanchored patterns in 8-bit
|
||||
mode in the interpreters. Instead of just remembering whether one case matched
|
||||
or not, it remembers the position of a previous match so as to avoid
|
||||
unnecessary repeated searching.
|
||||
|
||||
6. Perl now locks out \K in lookarounds, so PCRE2 now does the same by default.
|
||||
However, just in case anybody was relying on the old behaviour, there is an
|
||||
option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour.
|
||||
An option has also been added to pcre2grep to enable this.
|
||||
|
||||
7. Re-enable a JIT optimization which was unintentionally disabled in 10.35.
|
||||
|
||||
8. There is a loop counter to catch excessively crazy patterns when checking
|
||||
the lengths of lookbehinds at compile time. This was incorrectly getting reset
|
||||
whenever a lookahead was processed, leading to some fuzzer-generated patterns
|
||||
taking a very long time to compile when (?|) was present in the pattern,
|
||||
because (?|) disables caching of group lengths.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
|
@ -186,7 +463,7 @@ now correctly backtracked, so this unnecessary restriction has been removed.
|
|||
|
||||
7. Added PCRE2_SUBSTITUTE_MATCHED.
|
||||
|
||||
8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another
|
||||
8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
|
||||
regex engine. The Perl regex folks are aware of this usage and have made a note
|
||||
about it.
|
||||
|
||||
|
@ -617,7 +894,7 @@ Patch by Guillem Jover.
|
|||
warnings were reported.
|
||||
|
||||
38. Using the clang compiler with sanitizing options causes runtime complaints
|
||||
about truncation for statments such as x = ~x when x is an 8-bit value; it
|
||||
about truncation for statements such as x = ~x when x is an 8-bit value; it
|
||||
seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
|
||||
gets rid of the warnings. There were also two missing casts in pcre2test.
|
||||
|
||||
|
|
64
HACKING
64
HACKING
|
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
|
|||
the pcre2test documentation and the comment at the head of the RunTest file.
|
||||
|
||||
PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
|
||||
releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
|
||||
confusion with PCRE1.
|
||||
releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
|
||||
releases started at 10.00 to avoid confusion with PCRE1.
|
||||
|
||||
|
||||
Historical note 1
|
||||
|
@ -38,8 +38,8 @@ Historical note 2
|
|||
By contrast, the code originally written by Henry Spencer (which was
|
||||
subsequently heavily modified for Perl) compiles the expression twice: once in
|
||||
a dummy mode in order to find out how much store will be needed, and then for
|
||||
real. (The Perl version probably doesn't do this any more; I'm talking about
|
||||
the original library.) The execution function operates by backtracking and
|
||||
real. (The Perl version may or may not still do this; I'm talking about the
|
||||
original library.) The execution function operates by backtracking and
|
||||
maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
|
||||
matches individual wild portions of the pattern. This is an "NFA algorithm" in
|
||||
Friedl's terminology.
|
||||
|
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
|
|||
advance to check for such values. When auto-callouts are enabled, the generous
|
||||
assumption is made that there will be a callout for each pattern code unit
|
||||
(which of course is only actually true if all code units are literals) plus one
|
||||
at the end. There is a default parsed pattern vector on the system stack, but
|
||||
if this is not big enough, heap memory is used.
|
||||
at the end. A default parsed pattern vector is defined on the system stack, to
|
||||
minimize memory handling, but if this is not big enough, heap memory is used.
|
||||
|
||||
As before, the actual compiling function is run twice, the first time to
|
||||
determine the amount of memory needed for the final compiled pattern. It
|
||||
|
@ -187,7 +187,7 @@ META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
|
|||
META_CLASS_EMPTY_NOT [^] negative empty class - ditto
|
||||
META_CLASS_END ] end of non-empty class
|
||||
META_CLASS_NOT [^ start non-empty negative class
|
||||
META_COMMIT (*COMMIT)
|
||||
META_COMMIT (*COMMIT) - no argument (see below for with argument)
|
||||
META_COND_ASSERT (?(?assertion)
|
||||
META_DOLLAR $ metacharacter
|
||||
META_DOT . metacharacter
|
||||
|
@ -201,18 +201,18 @@ META_NOCAPTURE (?: no capture parens
|
|||
META_PLUS +
|
||||
META_PLUS_PLUS ++
|
||||
META_PLUS_QUERY +?
|
||||
META_PRUNE (*PRUNE) - no argument
|
||||
META_PRUNE (*PRUNE) - no argument (see below for with argument)
|
||||
META_QUERY ?
|
||||
META_QUERY_PLUS ?+
|
||||
META_QUERY_QUERY ??
|
||||
META_RANGE_ESCAPED hyphen in class range with at least one escape
|
||||
META_RANGE_LITERAL hyphen in class range defined literally
|
||||
META_SKIP (*SKIP) - no argument
|
||||
META_THEN (*THEN) - no argument
|
||||
META_SKIP (*SKIP) - no argument (see below for with argument)
|
||||
META_THEN (*THEN) - no argument (see below for with argument)
|
||||
|
||||
The two RANGE values occur only in character classes. They are positioned
|
||||
between two literals that define the start and end of the range. In an EBCDIC
|
||||
evironment it is necessary to know whether either of the range values was
|
||||
environment it is necessary to know whether either of the range values was
|
||||
specified as an escape. In an ASCII/Unicode environment the distinction is not
|
||||
relevant.
|
||||
|
||||
|
@ -229,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
|
|||
is the length of its branch, for which OP_REVERSE must be generated.
|
||||
|
||||
META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
|
||||
their data in the lower 16 bits of the element.
|
||||
their data in the lower 16 bits of the element. META_RECURSE is followed by an
|
||||
offset, for use in error messages.
|
||||
|
||||
META_BACKREF is followed by an offset if the back reference group number is 10
|
||||
or more. The offsets of the first ocurrences of references to groups whose
|
||||
or more. The offsets of the first occurrences of references to groups whose
|
||||
numbers are less than 10 are put in cb->small_ref_offset[] (only the first
|
||||
occurrence is useful). On 64-bit systems this avoids using more than two parsed
|
||||
pattern elements for items such as \3. The offset is used when an error occurs
|
||||
because the reference is to a non-existent group.
|
||||
|
||||
META_RECURSE is always followed by an offset, for use in error messages.
|
||||
|
||||
META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
|
||||
element contains the 16-bit type and data property values, packed together.
|
||||
ESC_g and ESC_k are used only for named references - numerical ones are turned
|
||||
|
@ -291,9 +290,9 @@ META_LOOKBEHIND (?<= start of lookbehind
|
|||
META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind
|
||||
META_LOOKBEHINDNOT (?<! start of negative lookbehind
|
||||
|
||||
The following are followed by two elements, the minimum and maximum. Repeat
|
||||
values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
|
||||
represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
The following are followed by two elements, the minimum and maximum. The
|
||||
maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
|
||||
is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
|
||||
META_MINMAX {n,m} repeat
|
||||
META_MINMAX_PLUS {n,m}+ repeat
|
||||
|
@ -347,11 +346,11 @@ support is not available for this kind of matching.
|
|||
Changeable options
|
||||
------------------
|
||||
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
|
||||
others) may be changed in the middle of patterns by items such as (?i). Their
|
||||
processing is handled entirely at compile time by generating different opcodes
|
||||
for the different settings. The runtime functions do not need to keep track of
|
||||
an option's state.
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
|
||||
some others may be changed in the middle of patterns by items such as (?i).
|
||||
Their processing is handled entirely at compile time by generating different
|
||||
opcodes for the different settings. The runtime functions do not need to keep
|
||||
track of an option's state.
|
||||
|
||||
PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
|
||||
are tracked and processed during the parsing pre-pass. The others are handled
|
||||
|
@ -437,7 +436,7 @@ Backtracking control verbs
|
|||
--------------------------
|
||||
|
||||
Verbs with no arguments generate opcodes with no following data (as listed
|
||||
in the section above).
|
||||
in the section above).
|
||||
|
||||
(*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
|
||||
length in one code unit, and followed by a binary zero. The name length is
|
||||
|
@ -468,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
|
|||
case-equivalent code points (which is possible only in UTF mode) is handled by
|
||||
compiling a Unicode property item (see below), with the pseudo-property
|
||||
PT_CLIST. The value of this property is an offset in a vector called
|
||||
"ucd_caseless_sets" which identifies the start of a short list of equivalent
|
||||
characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
"ucd_caseless_sets" which identifies the start of a short list of case
|
||||
equivalent characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
|
||||
|
||||
Repeating single characters
|
||||
|
@ -546,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
|
|||
and a value. The types are a set of #defines of the form PT_xxx, and the values
|
||||
are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
|
||||
The value is relevant only for PT_GC (General Category), PT_PC (Particular
|
||||
Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
|
||||
identify a list of case-equivalent characters when there are three or more.
|
||||
Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
|
||||
and the pseudo-property PT_CLIST, which is used to identify a list of
|
||||
case-equivalent characters when there are three or more (see above).
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
|
||||
|
@ -665,9 +665,9 @@ a count that immediately follows the offset.
|
|||
There are several opcodes that mark the end of a subpattern group. OP_KET is
|
||||
used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
|
||||
OP_KETRMAX are used for indefinite repetitions, minimally or maximally
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
details). All four are followed by a LINK_SIZE value giving (as a positive
|
||||
number) the offset back to the matching bracket opcode.
|
||||
number) the offset back to the matching opening bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
|
@ -718,7 +718,7 @@ Assertions
|
|||
|
||||
Forward assertions are also just like other subpatterns, but starting with one
|
||||
of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
|
||||
assertion is OP_REVERSE, followed by a count of the number of characters to
|
||||
move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
|
||||
|
@ -827,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
|
|||
opcode are the correct length, in order to catch updating errors.
|
||||
|
||||
Philip Hazel
|
||||
12 July 2019
|
||||
April 2022
|
||||
|
|
8
LICENCE
8
LICENCE
|
@ -23,10 +23,10 @@ Written by: Philip Hazel
|
|||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2021 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2021 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2021 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
module(
|
||||
name = "pcre2",
|
||||
version = "10.40",
|
||||
compatibility_level = 1,
|
||||
)
|
||||
|
||||
bazel_dep(name = "rules_cc", version = "0.0.1")
|
||||
bazel_dep(name = "bazel_skylib", version = "1.2.1")
|
15
Makefile.am
15
Makefile.am
|
@ -382,6 +382,10 @@ COMMON_SOURCES = \
|
|||
src/pcre2_valid_utf.c \
|
||||
src/pcre2_xclass.c
|
||||
|
||||
# The pcre2_ucptables.c file is #included by pcre2_tables.c
|
||||
|
||||
EXTRA_DIST += src/pcre2_ucptables.c
|
||||
|
||||
if WITH_PCRE2_8
|
||||
lib_LTLIBRARIES += libpcre2-8.la
|
||||
libpcre2_8_la_SOURCES = \
|
||||
|
@ -448,9 +452,10 @@ EXTRA_DIST += \
|
|||
src/sljit/sljitNativePPC_32.c \
|
||||
src/sljit/sljitNativePPC_64.c \
|
||||
src/sljit/sljitNativePPC_common.c \
|
||||
src/sljit/sljitNativeRISCV_32.c \
|
||||
src/sljit/sljitNativeRISCV_64.c \
|
||||
src/sljit/sljitNativeRISCV_common.c \
|
||||
src/sljit/sljitNativeS390X.c \
|
||||
src/sljit/sljitNativeSPARC_32.c \
|
||||
src/sljit/sljitNativeSPARC_common.c \
|
||||
src/sljit/sljitNativeX86_32.c \
|
||||
src/sljit/sljitNativeX86_64.c \
|
||||
src/sljit/sljitNativeX86_common.c \
|
||||
|
@ -663,6 +668,7 @@ EXTRA_DIST += \
|
|||
testdata/testinput23 \
|
||||
testdata/testinput24 \
|
||||
testdata/testinput25 \
|
||||
testdata/testinput26 \
|
||||
testdata/testinputEBC \
|
||||
testdata/testoutput1 \
|
||||
testdata/testoutput2 \
|
||||
|
@ -705,6 +711,7 @@ EXTRA_DIST += \
|
|||
testdata/testoutput23 \
|
||||
testdata/testoutput24 \
|
||||
testdata/testoutput25 \
|
||||
testdata/testoutput26 \
|
||||
testdata/testoutputEBC \
|
||||
testdata/valgrind-jit.supp \
|
||||
testdata/wintestinput3 \
|
||||
|
@ -859,9 +866,11 @@ endif # WITH_GCOV
|
|||
|
||||
EXTRA_DIST += \
|
||||
cmake/COPYING-CMAKE-SCRIPTS \
|
||||
cmake/FindEditline.cmake \
|
||||
cmake/FindPackageHandleStandardArgs.cmake \
|
||||
cmake/FindReadline.cmake \
|
||||
cmake/FindEditline.cmake \
|
||||
cmake/pcre2-config-version.cmake.in \
|
||||
cmake/pcre2-config.cmake.in \
|
||||
CMakeLists.txt \
|
||||
config-cmake.h.in
|
||||
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
#
|
||||
# Project: pcre2
|
||||
#
|
||||
# Created on: 10-01-2022 22:01:46
|
||||
#
|
||||
# commands to use:
|
||||
# make -f Makefile.os4 libpcre2.a
|
||||
# make -f Makefile.os4 libpcre2-posix.a
|
||||
# make -f Makefile.os4 pcre2test
|
||||
# sh RunTest
|
||||
# make -f Makefile.os4 clean
|
||||
#
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Objects
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2_OBJ := \
|
||||
src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
|
||||
src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
|
||||
src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
|
||||
src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
|
||||
src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
|
||||
src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
|
||||
src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
|
||||
src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
|
||||
src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
|
||||
|
||||
|
||||
|
||||
pcre2posix_OBJ := \
|
||||
src/pcre2posix.o
|
||||
|
||||
|
||||
pcre2test_OBJ := \
|
||||
src/pcre2test.o
|
||||
|
||||
|
||||
pcre2grep_OBJ := \
|
||||
src/pcre2grep.o
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Variables and Environment
|
||||
##
|
||||
###################################################################
|
||||
|
||||
MCRT := -mcrt=newlib
|
||||
ifeq ($(USE_CLIB2), yes)
|
||||
MCRT := -mcrt=clib2
|
||||
endif
|
||||
|
||||
CC := gcc:bin/gcc
|
||||
|
||||
INCPATH := -I. -Isrc
|
||||
|
||||
# for pcre2test
|
||||
CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// General rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# Targets that name an action rather than a file. "release" has to be listed
|
||||
# because its recipe creates (and later removes) a directory called "release";
|
||||
# if that directory were left behind, a non-phony target would be considered
|
||||
# up to date and the recipe would never run again.
|
||||
.PHONY: all all-before all-after tests clean clean-custom cleanall realclean runtests release
|
||||
|
||||
all: all-before libpcre2.a libpcre2-posix.a all-after
|
||||
|
||||
all-before:
|
||||
# You can add rules here to execute before the project is built
|
||||
|
||||
all-after:
|
||||
# You can add rules here to execute after the project is built
|
||||
|
||||
tests: pcre2test pcre2grep
|
||||
|
||||
# Remove every object file this makefile can produce (library, POSIX wrapper,
|
||||
# pcre2test and pcre2grep). Built archives and programs are left in place.
|
||||
clean: clean-custom
|
||||
	@echo "Cleaning compiler objects..."
|
||||
	@rm -f $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ) $(pcre2grep_OBJ)
|
||||
|
||||
# Remove the objects (via "clean") and then the final build products. The
|
||||
# archive names must match the libpcre2.a and libpcre2-posix.a targets.
|
||||
cleanall: clean
|
||||
	@echo "Cleaning compiler targets..."
|
||||
	@rm -f libpcre2.a libpcre2-posix.a pcre2test pcre2grep
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Targets
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2.a: $(libpcre2_OBJ)
|
||||
ar -rcs libpcre2.a $(libpcre2_OBJ)
|
||||
ranlib libpcre2.a
|
||||
|
||||
libpcre2-posix.a: $(pcre2posix_OBJ)
|
||||
ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
|
||||
ranlib libpcre2-posix.a
|
||||
|
||||
# Link the pcre2test program with the compiler selected by $(CC).
|
||||
# libpcre2-posix is named before libpcre2 because the POSIX wrapper is built on
|
||||
# top of the main library and static archives are resolved from left to right.
|
||||
pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
|
||||
	@echo "Linking pcre2test"
|
||||
	@$(CC) $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2-posix -lpcre2
|
||||
	@echo "Removing stale debug target: pcre2test"
|
||||
	@rm -f pcre2test.debug
|
||||
|
||||
# Link the pcre2grep program with the compiler selected by $(CC), the same
|
||||
# variable the .c.o rule uses, so that overriding CC affects every step.
|
||||
pcre2grep: libpcre2.a $(pcre2grep_OBJ)
|
||||
	@echo "Linking pcre2grep"
|
||||
	@$(CC) $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
|
||||
	@echo "Removing stale debug target: pcre2grep"
|
||||
	@rm -f pcre2grep.debug
|
||||
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Standard rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# A default rule to make all the objects listed below
|
||||
# because we are hiding compiler commands from the output
|
||||
|
||||
.c.o:
|
||||
@echo "Compiling $<"
|
||||
@$(CC) -c $< -o $*.o $(CFLAGS)
|
||||
|
||||
src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
|
||||
src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
|
||||
src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
|
||||
|
||||
src/pcre2_maketables.o: src/pcre2_maketables.c
|
||||
|
||||
src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
|
||||
src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
|
||||
src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
|
||||
src/pcre2_ucd.c src/pcre2_printint.c
|
||||
|
||||
src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
|
||||
|
||||
|
||||
src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
|
||||
src/pcre2grep.o: src/pcre2grep.c src/config.h
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Custom rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
runtests: libpcre2.a libpcre2-posix.a tests
|
||||
sh RunTest
|
||||
sh RunGrepTest
|
||||
|
||||
# Build the libraries once per C runtime (newlib, then clib2), collect them
|
||||
# together with the public headers and documentation under release/, pack that
|
||||
# tree into pcre2.lha and remove the staging directory again.
|
||||
# Sub-makes are started through $(MAKE) so that they use the same make program
|
||||
# and inherit the command-line flags of the parent invocation.
|
||||
release:
|
||||
	@echo "Create release folders..."
|
||||
	@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
|
||||
|
||||
	@echo "Building newlib based libraries..."
|
||||
	@$(MAKE) -f Makefile.os4 all
|
||||
	@cp libpcre2.a release/local/newlib/lib/
|
||||
	@cp libpcre2-posix.a release/local/newlib/lib/
|
||||
|
||||
	@echo "Clean build and libraries files..."
|
||||
	@$(MAKE) -f Makefile.os4 cleanall
|
||||
|
||||
	@echo "Building clib2 based libraries..."
|
||||
	@$(MAKE) -f Makefile.os4 all USE_CLIB2=yes
|
||||
	@cp libpcre2.a release/local/clib2/lib/
|
||||
	@cp libpcre2-posix.a release/local/clib2/lib/
|
||||
|
||||
	@echo "Copy the necessary files..."
|
||||
	@cp src/pcre2.h release/local/common/include/
|
||||
	@cp src/pcre2posix.h release/local/common/include/
|
||||
	@cp COPYING release/local/Documentation/pcre2/
|
||||
	@cp HACKING release/local/Documentation/pcre2/
|
||||
	@cp LICENCE release/local/Documentation/pcre2/
|
||||
	@cp README release/local/Documentation/pcre2/
|
||||
	@cp README-OS4.md release/local/Documentation/pcre2/
|
||||
|
||||
	@echo "Clean build and libraries files..."
|
||||
	@$(MAKE) -f Makefile.os4 cleanall
|
||||
|
||||
	@echo "Creating the lha release file..."
|
||||
	@rm -f pcre2.lha
|
||||
	@lha -aeqr3 a pcre2.lha release/
|
||||
|
||||
	@rm -rf release
|
||||
|
||||
###################################################################
|
||||
|
58
NEWS
58
NEWS
|
@ -2,6 +2,64 @@ News about PCRE2 releases
|
|||
-------------------------
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
This is mostly a bug-fixing and code-tidying release. However, there are some
|
||||
extensions to Unicode property handling:
|
||||
|
||||
* Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
* A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
As always, see ChangeLog for a list of all changes (also the Git log).
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
This release is happening soon after 10.38 because the bug fix is important.
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Update to Unicode 14.0.0.
|
||||
|
||||
3. Some code cleanups (see ChangeLog).
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
As well as some bug fixes and tidies (as always, see ChangeLog for details),
|
||||
the documentation is updated to list the new URLs, following the move of the
|
||||
source repository to GitHub and the mailing list to Google Groups.
|
||||
|
||||
* The CMake build system can now build both static and shared libraries in one
|
||||
go.
|
||||
|
||||
* Following Perl's lead, \K is now locked out in lookaround assertions by
|
||||
default, but an option is provided to re-enable the previous behaviour.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
-------------------------
|
||||
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -343,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
70
README
70
README
|
@ -5,18 +5,19 @@ PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
|||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and should not be used in new projects. The latest release of
|
||||
PCRE2 is available in three alternative formats from:
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE at
|
||||
pcre-dev@exim.org. You can access the archives and subscribe or manage your
|
||||
subscription here:
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
pcre2-dev+subscribe@googlegroups.com.
|
||||
|
||||
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -113,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -187,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -368,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -393,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -410,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -601,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -688,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -904,4 +912,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 28 April 2021
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
PCRE2 (Perl-compatible regular expression library)
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
|
||||
GitHub repository https://github.com/PCRE2Project/pcre2
|
||||
|
||||
More information about PCRE can be found at its official website
|
||||
at https://www.pcre.org and in the documentation that comes with this
|
||||
package.
|
||||
|
||||
In the archive both newlib and clib2 libraries are included. It has been
|
||||
tested with various applications, but in case you find issues please
|
||||
contact me.
|
||||
|
||||
To install it into your AmigaOS 4 SDK installation, just extract all the
|
||||
files into the SDK: path.
|
||||
|
||||
Compile
|
||||
--------------------------
|
||||
The source and the changes I made can be found at my personal repository
|
||||
https://git.walkero.gr/walkero/pcre2
|
||||
|
||||
You can compile it using the Makefile.os4 file, and produce the libraries
|
||||
yourself.
|
||||
|
||||
* with newlib run:
|
||||
```bash
|
||||
make -f Makefile.os4 all
|
||||
```
|
||||
* with clib2 run:
|
||||
```bash
|
||||
make -f Makefile.os4 all USE_CLIB2=yes
|
||||
```
|
||||
|
||||
Changelog
|
||||
--------------------------
|
||||
v10.40r1 - 2022-07-31
|
||||
* First release
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
# PCRE2 - Perl-Compatible Regular Expressions
|
||||
|
||||
The PCRE2 library is a set of C functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
|
||||
own native API, as well as a set of wrapper functions that correspond to the
|
||||
POSIX regular expression API. The PCRE2 library is free, even for building
|
||||
proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
|
||||
or 32-bit code units, in either literal or UTF encoding.
|
||||
|
||||
PCRE2 was first released in 2015 to replace the API in the original PCRE
|
||||
library, which is now obsolete and no longer maintained. As well as a more
|
||||
flexible API, the code of PCRE2 has been much improved since the fork.
|
||||
|
||||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
If you just need the command-line PCRE2 tools on Windows, precompiled binary
|
||||
versions are available at this
|
||||
[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
|
||||
|
||||
A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
|
||||
default character encoding, can be found at
|
||||
[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
|
||||
|
||||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
||||
There is a curated summary of changes for each PCRE release, copies of
|
||||
documentation from older releases, and other useful information from the third
|
||||
party authored
|
||||
[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
|
||||
|
||||
## Contact
|
||||
|
||||
To report a problem with the PCRE2 library, or to make a feature request, please
|
||||
use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
|
||||
PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
|
||||
announcements will be made. You can browse the
|
||||
[list archives](https://groups.google.com/g/pcre2-dev).
|
||||
|
60
RunGrepTest
60
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -558,7 +574,7 @@ echo "RC=$?" >>testtrygrep
|
|||
echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
|
||||
echo "a" >testtemp1grep
|
||||
echo "aaaaa" >>testtemp1grep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets --allow-lookaround-bsk '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 108 ------------------------------" >>testtrygrep
|
||||
|
@ -638,13 +654,13 @@ echo "RC=$?" >>testtrygrep
|
|||
|
||||
echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
|
||||
printf 'abcd\n' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?<=\K.)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K.)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?=.\K)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=.\K)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?<=\K[ac])' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K[ac])' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?=[ac]\K)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
|
||||
|
@ -674,13 +690,27 @@ echo "---------------------------- Test 131 -----------------------------" >>tes
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <$srcdir/testdata/grepinput >>testtrygrep 2>&1
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -701,7 +731,7 @@ if [ $utf8 -ne 0 ] ; then
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U3 ------------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any --allow-lookaround-bsk '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
|
||||
|
@ -755,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# This next test involves NUL characters. It seems impossible to handle them
|
||||
# easily in many operating systems. An earlier version of this script used sed
|
||||
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character (@). However, on (some
|
||||
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
|
|
63
RunTest
63
RunTest
|
@ -17,8 +17,16 @@
|
|||
# individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
|
||||
# end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
|
||||
# runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
|
||||
# except test 10. Whatever order the arguments are in, the tests are always run
|
||||
# in numerical order.
|
||||
# except test 10. Whatever order the arguments are in, these tests are always
|
||||
# run in numerical order.
|
||||
#
|
||||
# If no specific tests are selected (which is the case when this script is run
|
||||
# via 'make check') the default is to run all the numbered tests.
|
||||
#
|
||||
# There may also be named (as well as numbered) tests for special purposes. At
|
||||
# present there is just one, called "heap". This test's output contains the
|
||||
# sizes of heap frames and frame vectors, which depend on the environment. It
|
||||
# is therefore not run unless explicitly requested.
|
||||
#
|
||||
# Inappropriate tests are automatically skipped (with a comment to say so). For
|
||||
# example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
|
||||
|
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
|||
title23="Test 23: \C disabled test"
|
||||
title24="Test 24: Non-UTF pattern conversion tests"
|
||||
title25="Test 25: UTF pattern conversion tests"
|
||||
maxtest=25
|
||||
title26="Test 26: Auto-generated unicode property tests"
|
||||
maxtest=26
|
||||
titleheap="Test 'heap': Environment-specific heap tests"
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title23
|
||||
echo $title24
|
||||
echo $title25
|
||||
echo $title26
|
||||
echo ""
|
||||
echo $titleheap
|
||||
echo ""
|
||||
echo "Numbered tests are automatically run if nothing selected."
|
||||
echo "Named tests must be explicitly selected."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -238,6 +254,8 @@ do22=no
|
|||
do23=no
|
||||
do24=no
|
||||
do25=no
|
||||
do26=no
|
||||
doheap=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
|
|||
23) do23=yes;;
|
||||
24) do24=yes;;
|
||||
25) do25=yes;;
|
||||
26) do26=yes;;
|
||||
heap) doheap=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -320,7 +340,8 @@ fi
|
|||
# set up a large stack.
|
||||
|
||||
$sim ./pcre2test -S 64 /dev/null /dev/null
|
||||
if [ $? -eq 0 -a "$bigstack" != "" ] ; then
|
||||
support_setstack=$?
|
||||
if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
|
||||
setstack="-S 64"
|
||||
else
|
||||
setstack=""
|
||||
|
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# If no specific tests were requested, select all. Those that are not
|
||||
# relevant will be automatically skipped.
|
||||
# If no specific tests were requested, select all the numbered tests. Those
|
||||
# that are not relevant will be automatically skipped.
|
||||
|
||||
if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
|
||||
|
@ -416,7 +437,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
||||
$do24 = no -a $do25 = no \
|
||||
$do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -444,6 +465,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do23=yes
|
||||
do24=yes
|
||||
do25=yes
|
||||
do26=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
echo '' >testtry
|
||||
checkspecial '-C'
|
||||
checkspecial '--help'
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
if [ $support_setstack -eq 0 ] ; then
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
fi
|
||||
echo " OK"
|
||||
fi
|
||||
|
||||
|
@ -860,6 +884,29 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
fi
|
||||
|
||||
# Auto-generated unicode property tests
|
||||
|
||||
if [ $do26 = yes ] ; then
|
||||
echo $title26
|
||||
if [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
|
||||
checkresult $? 26 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Manually selected heap tests - output may vary in different environments,
|
||||
# which is why they are not automatically run.
|
||||
|
||||
if [ $doheap = yes ] ; then
|
||||
echo $titleheap
|
||||
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
|
||||
checkresult $? heap-$bits ""
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
||||
|
|
|
@ -135,9 +135,9 @@ if "%all%" == "yes" (
|
|||
set do7=yes
|
||||
set do8=yes
|
||||
set do9=yes
|
||||
set do10=yes
|
||||
set do10=no
|
||||
set do11=yes
|
||||
set do12=yes
|
||||
set do12=no
|
||||
set do13=yes
|
||||
set do14=yes
|
||||
set do15=yes
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# See MODULE.bazel
|
|
@ -1,17 +1,16 @@
|
|||
# Modified from FindReadline.cmake (PH Feb 2012)
|
||||
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
set(EDITLINE_FOUND TRUE)
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
|
||||
/usr/include/editline
|
||||
/usr/include/edit/readline
|
||||
/usr/include/readline
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
|
||||
editline
|
||||
edit/readline
|
||||
)
|
||||
|
||||
FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
|
||||
MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
set(PACKAGE_VERSION_MAJOR @PCRE2_MAJOR@)
|
||||
set(PACKAGE_VERSION_MINOR @PCRE2_MINOR@)
|
||||
set(PACKAGE_VERSION_PATCH 0)
|
||||
set(PACKAGE_VERSION @PCRE2_MAJOR@.@PCRE2_MINOR@.0)
|
||||
|
||||
# Check whether the requested PACKAGE_FIND_VERSION is compatible
|
||||
if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION OR
|
||||
PACKAGE_VERSION_MAJOR GREATER PACKAGE_FIND_VERSION_MAJOR)
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
else()
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
if(PACKAGE_VERSION VERSION_EQUAL PACKAGE_FIND_VERSION)
|
||||
set(PACKAGE_VERSION_EXACT TRUE)
|
||||
endif()
|
||||
endif()
|
|
@ -0,0 +1,145 @@
|
|||
# pcre2-config.cmake
|
||||
# ----------------
|
||||
#
|
||||
# Finds the PCRE2 library, specify the starting search path in PCRE2_ROOT.
|
||||
#
|
||||
# Static vs. shared
|
||||
# -----------------
|
||||
# To make use of the static library instead of the shared one, one needs
|
||||
# to set the variable PCRE2_USE_STATIC_LIBS to ON before calling find_package.
|
||||
# Example:
|
||||
# set(PCRE2_USE_STATIC_LIBS ON)
|
||||
# find_package(PCRE2 CONFIG COMPONENTS 8BIT)
|
||||
#
|
||||
# This will define the following variables:
|
||||
#
|
||||
# PCRE2_FOUND - True if the system has the PCRE2 library.
|
||||
# PCRE2_VERSION - The version of the PCRE2 library which was found.
|
||||
#
|
||||
# and the following imported targets:
|
||||
#
|
||||
# PCRE2::8BIT - The 8 bit PCRE2 library.
|
||||
# PCRE2::16BIT - The 16 bit PCRE2 library.
|
||||
# PCRE2::32BIT - The 32 bit PCRE2 library.
|
||||
# PCRE2::POSIX - The POSIX PCRE2 library.
|
||||
|
||||
set(PCRE2_NON_STANDARD_LIB_PREFIX @NON_STANDARD_LIB_PREFIX@)
|
||||
set(PCRE2_NON_STANDARD_LIB_SUFFIX @NON_STANDARD_LIB_SUFFIX@)
|
||||
set(PCRE2_8BIT_NAME pcre2-8)
|
||||
set(PCRE2_16BIT_NAME pcre2-16)
|
||||
set(PCRE2_32BIT_NAME pcre2-32)
|
||||
set(PCRE2_POSIX_NAME pcre2-posix)
|
||||
find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h DOC "PCRE2 include directory")
|
||||
if (PCRE2_USE_STATIC_LIBS)
|
||||
if (MSVC)
|
||||
set(PCRE2_8BIT_NAME pcre2-8-static)
|
||||
set(PCRE2_16BIT_NAME pcre2-16-static)
|
||||
set(PCRE2_32BIT_NAME pcre2-32-static)
|
||||
set(PCRE2_POSIX_NAME pcre2-posix-static)
|
||||
endif ()
|
||||
|
||||
set(PCRE2_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
|
||||
set(PCRE2_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
|
||||
else ()
|
||||
set(PCRE2_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
|
||||
if (MINGW AND PCRE2_NON_STANDARD_LIB_PREFIX)
|
||||
set(PCRE2_PREFIX "")
|
||||
endif ()
|
||||
|
||||
set(PCRE2_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
|
||||
if (MINGW AND PCRE2_NON_STANDARD_LIB_SUFFIX)
|
||||
set(PCRE2_SUFFIX "-0.dll")
|
||||
endif ()
|
||||
endif ()
|
||||
find_library(PCRE2_8BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit PCRE2 library")
|
||||
# Locate the 16-bit library; the second candidate is its debug build name ("d" suffix).
find_library(PCRE2_16BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}d${PCRE2_SUFFIX} DOC "16 bit PCRE2 library")
|
||||
# Locate the 32-bit library; the second candidate is its debug build name ("d" suffix).
find_library(PCRE2_32BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}d${PCRE2_SUFFIX} DOC "32 bit PCRE2 library")
|
||||
# Locate the POSIX wrapper library; the second candidate is its debug build name ("d" suffix).
find_library(PCRE2_POSIX_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}d${PCRE2_SUFFIX} DOC "8 bit POSIX PCRE2 library")
|
||||
unset(PCRE2_NON_STANDARD_LIB_PREFIX)
|
||||
unset(PCRE2_NON_STANDARD_LIB_SUFFIX)
|
||||
unset(PCRE2_8BIT_NAME)
|
||||
unset(PCRE2_16BIT_NAME)
|
||||
unset(PCRE2_32BIT_NAME)
|
||||
unset(PCRE2_POSIX_NAME)
|
||||
|
||||
# Set version
|
||||
if (PCRE2_INCLUDE_DIR)
|
||||
set(PCRE2_VERSION "@PCRE2_MAJOR@.@PCRE2_MINOR@.0")
|
||||
endif ()
|
||||
|
||||
# Which components have been found.
|
||||
if (PCRE2_8BIT_LIBRARY)
|
||||
set(PCRE2_8BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_16BIT_LIBRARY)
|
||||
set(PCRE2_16BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_32BIT_LIBRARY)
|
||||
set(PCRE2_32BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_POSIX_LIBRARY)
|
||||
set(PCRE2_POSIX_FOUND TRUE)
|
||||
endif ()
|
||||
|
||||
# Check if at least one component has been specified.
|
||||
list(LENGTH PCRE2_FIND_COMPONENTS PCRE2_NCOMPONENTS)
|
||||
if (PCRE2_NCOMPONENTS LESS 1)
|
||||
message(FATAL_ERROR "No components have been specified. This is not allowed. Please, specify at least one component.")
|
||||
endif ()
|
||||
unset(PCRE2_NCOMPONENTS)
|
||||
|
||||
# When POSIX component has been specified make sure that also 8BIT component is specified.
|
||||
set(PCRE2_8BIT_COMPONENT FALSE)
|
||||
set(PCRE2_POSIX_COMPONENT FALSE)
|
||||
foreach(component ${PCRE2_FIND_COMPONENTS})
|
||||
if (component STREQUAL "8BIT")
|
||||
set(PCRE2_8BIT_COMPONENT TRUE)
|
||||
elseif (component STREQUAL "POSIX")
|
||||
set(PCRE2_POSIX_COMPONENT TRUE)
|
||||
endif ()
|
||||
endforeach()
|
||||
|
||||
if (PCRE2_POSIX_COMPONENT AND NOT PCRE2_8BIT_COMPONENT)
|
||||
message(FATAL_ERROR "The component POSIX is specified while the 8BIT one is not. This is not allowed. Please, also specify the 8BIT component.")
|
||||
endif()
|
||||
unset(PCRE2_8BIT_COMPONENT)
|
||||
unset(PCRE2_POSIX_COMPONENT)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
|
||||
find_package_handle_standard_args(PCRE2
|
||||
FOUND_VAR PCRE2_FOUND
|
||||
REQUIRED_VARS PCRE2_INCLUDE_DIR
|
||||
HANDLE_COMPONENTS
|
||||
VERSION_VAR PCRE2_VERSION
|
||||
CONFIG_MODE
|
||||
)
|
||||
|
||||
set(PCRE2_LIBRARIES)
|
||||
if (PCRE2_FOUND)
|
||||
foreach(component ${PCRE2_FIND_COMPONENTS})
|
||||
if (PCRE2_USE_STATIC_LIBS)
|
||||
add_library(PCRE2::${component} STATIC IMPORTED)
|
||||
target_compile_definitions(PCRE2::${component} INTERFACE PCRE2_STATIC)
|
||||
else ()
|
||||
add_library(PCRE2::${component} SHARED IMPORTED)
|
||||
endif ()
|
||||
set_target_properties(PCRE2::${component} PROPERTIES
|
||||
IMPORTED_LOCATION "${PCRE2_${component}_LIBRARY}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_INCLUDE_DIR}"
|
||||
)
|
||||
if (component STREQUAL "POSIX")
|
||||
set_target_properties(PCRE2::${component} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "PCRE2::8BIT"
|
||||
LINK_LIBRARIES "PCRE2::8BIT"
|
||||
)
|
||||
endif ()
|
||||
|
||||
set(PCRE2_LIBRARIES ${PCRE2_LIBRARIES} ${PCRE2_${component}_LIBRARY})
|
||||
mark_as_advanced(PCRE2_${component}_LIBRARY)
|
||||
endforeach()
|
||||
endif ()
|
||||
|
||||
mark_as_advanced(
|
||||
PCRE2_INCLUDE_DIR
|
||||
)
|
|
@ -2,8 +2,6 @@
|
|||
|
||||
#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
|
||||
#cmakedefine HAVE_DIRENT_H 1
|
||||
#cmakedefine HAVE_INTTYPES_H 1
|
||||
#cmakedefine HAVE_STDINT_H 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
#cmakedefine HAVE_SYS_STAT_H 1
|
||||
#cmakedefine HAVE_SYS_TYPES_H 1
|
||||
|
@ -16,8 +14,6 @@
|
|||
#cmakedefine HAVE_SECURE_GETENV 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
|
||||
#cmakedefine PCRE2_STATIC 1
|
||||
|
||||
#cmakedefine SUPPORT_PCRE2_8 1
|
||||
#cmakedefine SUPPORT_PCRE2_16 1
|
||||
#cmakedefine SUPPORT_PCRE2_32 1
|
||||
|
|
40
configure.ac
40
configure.ac
|
@ -9,15 +9,15 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
|
|||
dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||
|
||||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [37])
|
||||
m4_define(pcre2_minor, [41])
|
||||
m4_define(pcre2_prerelease, [])
|
||||
m4_define(pcre2_date, [2021-05-26])
|
||||
m4_define(pcre2_date, [2022-xx-xx])
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [10:2:10])
|
||||
m4_define(libpcre2_16_version, [10:2:10])
|
||||
m4_define(libpcre2_32_version, [10:2:10])
|
||||
m4_define(libpcre2_posix_version, [3:0:0])
|
||||
m4_define(libpcre2_8_version, [11:0:11])
|
||||
m4_define(libpcre2_16_version, [11:0:11])
|
||||
m4_define(libpcre2_32_version, [11:0:11])
|
||||
m4_define(libpcre2_posix_version, [3:2:0])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
@ -513,6 +513,19 @@ AC_TYPE_SIZE_T
|
|||
# Checks for library functions.
|
||||
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
|
||||
AC_MSG_CHECKING([for realpath])
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
]],[[
|
||||
char buffer[PATH_MAX];
|
||||
realpath(".", buffer);
|
||||
]])],
|
||||
[AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_REALPATH], 1,
|
||||
[Define to 1 if you have the `realpath' function.])
|
||||
],
|
||||
AC_MSG_RESULT([no]))
|
||||
|
||||
# Check for the availability of libz (aka zlib)
|
||||
|
||||
|
@ -584,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
|
|||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Check for the availability of libedit. Different distributions put its
|
||||
# headers in different places. Try to cover the most common ones.
|
||||
|
||||
if test "$enable_pcre2test_libedit" = "yes"; then
|
||||
AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
|
||||
AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
|
||||
HAVE_LIBEDIT_HEADER=1
|
||||
break
|
||||
])
|
||||
AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
|
||||
fi
|
||||
|
||||
|
@ -927,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
|
|||
echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
|
||||
"$HAVE_READLINE_READLINE_H" != "1"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
|
||||
echo "** nor readline/readline.h was found."
|
||||
if test -z "$HAVE_LIBEDIT_HEADER"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
|
||||
echo "** edit/readline/readline.h nor a compatible header was found."
|
||||
exit 1
|
||||
fi
|
||||
if test -z "$LIBEDIT"; then
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -343,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
|
@ -5,18 +5,19 @@ PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
|||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and should not be used in new projects. The latest release of
|
||||
PCRE2 is available in three alternative formats from:
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE at
|
||||
pcre-dev@exim.org. You can access the archives and subscribe or manage your
|
||||
subscription here:
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
pcre2-dev+subscribe@googlegroups.com.
|
||||
|
||||
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -113,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -187,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -368,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -393,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -410,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -601,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -688,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -904,4 +912,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 28 April 2021
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -28,7 +28,8 @@ nearly two decades, the limitations of the original API were making development
|
|||
increasingly difficult. The new API is more extensible, and it was simplified
|
||||
by abolishing the separate "study" optimizing function; in PCRE2, patterns are
|
||||
automatically optimized where possible. Since forking from PCRE1, the code has
|
||||
been extensively refactored and new features introduced.
|
||||
been extensively refactored and new features introduced. The old library is now
|
||||
obsolete and is no longer maintained.
|
||||
</P>
|
||||
<P>
|
||||
As well as Perl-style regular expression patterns, some features that appeared
|
||||
|
@ -193,18 +194,18 @@ function, listing its arguments and results.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<P>
|
||||
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||
use my two names separated by a dot at gmail.com.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 28 April 2021
|
||||
Last updated: 27 August 2021
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -92,8 +92,18 @@ Additional options may be set in the compile context via the
|
|||
function.
|
||||
</P>
|
||||
<P>
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the <i>errorcode</i> argument to the
|
||||
<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
|
||||
error was encountered is returned via the <i>erroroffset</i> argument.
|
||||
</P>
|
||||
<P>
|
||||
If there is no error, the value passed via <i>errorcode</i> returns the message
|
||||
"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
|
||||
via <i>erroroffset</i> is zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
|
|
|
@ -45,10 +45,16 @@ just once (except when processing lookaround assertions). This function is
|
|||
<i>workspace</i> Points to a vector of ints used as working space
|
||||
<i>wscount</i> Number of elements in the vector
|
||||
</pre>
|
||||
For <b>pcre2_dfa_match()</b>, a match context is needed only if you want to set
|
||||
up a callout function or specify the heap limit or the match or the recursion
|
||||
depth limits. The <i>length</i> and <i>startoffset</i> values are code units, not
|
||||
characters. The options are:
|
||||
The size of the output vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
|
||||
data block is therefore not advisable when using this function.
|
||||
</P>
|
||||
<P>
|
||||
A match context is needed only if you want to set up a callout function or
|
||||
specify the heap limit or the match or the recursion depth limits. The
|
||||
<i>length</i> and <i>startoffset</i> values are code units, not characters. The
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_COPY_MATCHED_SUBJECT
|
||||
|
|
|
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
<b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
|
||||
which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
page.
|
||||
</P>
|
||||
|
|
|
@ -30,8 +30,9 @@ This function creates a new match data block, which is used for holding the
|
|||
result of a match. The first argument specifies the number of pairs of offsets
|
||||
that are required. These form the "output vector" (ovector) within the match
|
||||
data block, and are used to identify the matched string and any captured
|
||||
substrings. There is always one pair of offsets; if <b>ovecsize</b> is zero, it
|
||||
is treated as one.
|
||||
substrings when matching with <b>pcre2_match()</b>, or a number of different
|
||||
matches at the same point when used with <b>pcre2_dfa_match()</b>. There is
|
||||
always one pair of offsets; if <b>ovecsize</b> is zero, it is treated as one.
|
||||
</P>
|
||||
<P>
|
||||
The second argument points to a general context, for custom memory management,
|
||||
|
|
|
@ -26,12 +26,15 @@ SYNOPSIS
|
|||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function creates a new match data block, which is used for holding the
|
||||
result of a match. The first argument points to a compiled pattern. The number
|
||||
of capturing parentheses within the pattern is used to compute the number of
|
||||
pairs of offsets that are required in the match data block. These form the
|
||||
"output vector" (ovector) within the match data block, and are used to identify
|
||||
the matched string and any captured substrings.
|
||||
This function creates a new match data block for holding the result of a match.
|
||||
The first argument points to a compiled pattern. The number of capturing
|
||||
parentheses within the pattern is used to compute the number of pairs of
|
||||
offsets that are required in the match data block. These form the "output
|
||||
vector" (ovector) within the match data block, and are used to identify the
|
||||
matched string and any captured substrings when matching with
|
||||
<b>pcre2_match()</b>. If you are using <b>pcre2_dfa_match()</b>, which uses the
|
||||
output vector in a different way, you should use <b>pcre2_match_data_create()</b>
|
||||
instead of this function.
|
||||
</P>
|
||||
<P>
|
||||
The second argument points to a general context, for custom memory management,
|
||||
|
|
|
@ -48,7 +48,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA <i>number_of_codes</i> is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in <i>bytes</i>
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL <i>codes</i> or <i>bytes</i> is NULL
|
||||
</pre>
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -30,7 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{df800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
|
||||
|
|
|
@ -68,29 +68,29 @@ automatically added.
|
|||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement
|
||||
(only relevant if PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
</pre>
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-zero; its
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
|
||||
contents must be the result of a call to <b>pcre2_match()</b> using the same
|
||||
pattern and subject.
|
||||
</P>
|
||||
|
|
|
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
</P>
|
||||
<P>
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
|
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|||
limit is set, less than the default.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
<b>pcre2_match()</b> uses the heap are given in the
|
||||
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
|
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<br>
|
||||
<br>
|
||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1383,8 +1381,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and <b>pcre2_compile()</b> returns a non-NULL value.
|
||||
error has occurred.
|
||||
</P>
|
||||
<P>
|
||||
There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
|
||||
|
@ -1399,15 +1396,18 @@ because the textual error messages that are obtained by calling the
|
|||
message"
|
||||
<a href="#geterrormessage">below)</a>
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in <b>pcre2.h</b>.
|
||||
for both positive and negative error codes in <b>pcre2.h</b>. When compilation
|
||||
is successful <i>errorcode</i> is set to a value that returns the message "no
|
||||
error" if passed to <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The value returned in <i>erroroffset</i> is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
</P>
|
||||
<P>
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
|
@ -1845,7 +1845,7 @@ undefined. It may cause your program to crash or loop.
|
|||
</P>
|
||||
<P>
|
||||
Note that this option can also be passed to <b>pcre2_match()</b> and
|
||||
<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -1914,6 +1914,13 @@ Extra compile options
|
|||
<P>
|
||||
The option bits that can be set in a compile context by calling the
|
||||
<b>pcre2_set_compile_extra_options()</b> function are as follows:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
</pre>
|
||||
Since release 10.38 PCRE2 has forbidden the use of \K within lookaround
|
||||
assertions, following Perl's lead. This option is provided to re-enable the
|
||||
previous behaviour (act in positive lookarounds, ignore in negative ones) in
|
||||
case anybody is relying on it.
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
</pre>
|
||||
|
@ -2048,8 +2055,8 @@ point. However, this applies only to characters whose code points are less than
|
|||
\d.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode properties
|
||||
of all characters can be tested with \p and \P, or, alternatively, the
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \p and \P, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
|
@ -2309,7 +2316,7 @@ return zero. The third argument should point to a <b>size_t</b> variable.
|
|||
PCRE2_INFO_LASTCODETYPE
|
||||
</pre>
|
||||
Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
|
@ -2512,20 +2519,31 @@ to an abstract format like Java or .NET serialization.
|
|||
Information about a successful or unsuccessful match is placed in a match
|
||||
data block, which is an opaque structure that is accessed by function calls. In
|
||||
particular, the match data block contains a vector of offsets into the subject
|
||||
string that define the matched part of the subject and any substrings that were
|
||||
captured. This is known as the <i>ovector</i>.
|
||||
string that define the matched parts of the subject. This is known as the
|
||||
<i>ovector</i>.
|
||||
</P>
|
||||
<P>
|
||||
Before calling <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b>, or
|
||||
<b>pcre2_jit_match()</b> you must create a match data block by calling one of
|
||||
the creation functions above. For <b>pcre2_match_data_create()</b>, the first
|
||||
argument is the number of pairs of offsets in the <i>ovector</i>. One pair of
|
||||
offsets is required to identify the string that matched the whole pattern, with
|
||||
an additional pair for each captured substring. For example, a value of 4
|
||||
creates enough space to record the matched portion of the subject plus three
|
||||
captured substrings. A minimum of at least 1 pair is imposed by
|
||||
<b>pcre2_match_data_create()</b>, so it is always possible to return the overall
|
||||
matched string.
|
||||
argument is the number of pairs of offsets in the <i>ovector</i>.
|
||||
</P>
|
||||
<P>
|
||||
When using <b>pcre2_match()</b>, one pair of offsets is required to identify the
|
||||
string that matched the whole pattern, with an additional pair for each
|
||||
captured substring. For example, a value of 4 creates enough space to record
|
||||
the matched portion of the subject plus three captured substrings.
|
||||
</P>
|
||||
<P>
|
||||
When using <b>pcre2_dfa_match()</b> there may be multiple matched substrings of
|
||||
different lengths at the same point in the subject. The ovector should be made
|
||||
large enough to hold as many as are expected.
|
||||
</P>
|
||||
<P>
|
||||
A minimum of 1 pair is imposed by <b>pcre2_match_data_create()</b>, so
|
||||
it is always possible to return the overall matched string in the case of
|
||||
<b>pcre2_match()</b> or the longest match in the case of
|
||||
<b>pcre2_dfa_match()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The second argument of <b>pcre2_match_data_create()</b> is a pointer to a
|
||||
|
@ -2536,10 +2554,11 @@ pass NULL, which causes <b>malloc()</b> to be used.
|
|||
<P>
|
||||
For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
|
||||
pointer to a compiled pattern. The ovector is created to be exactly the right
|
||||
size to hold all the substrings a pattern might capture. The second argument is
|
||||
again a pointer to a general context, but in this case if NULL is passed, the
|
||||
memory is obtained using the same allocator that was used for the compiled
|
||||
pattern (custom or default).
|
||||
size to hold all the substrings a pattern might capture when matched using
|
||||
<b>pcre2_match()</b>. You should not use this call when matching with
|
||||
<b>pcre2_dfa_match()</b>. The second argument is again a pointer to a general
|
||||
context, but in this case if NULL is passed, the memory is obtained using the
|
||||
same allocator that was used for the compiled pattern (custom or default).
|
||||
</P>
|
||||
<P>
|
||||
A match data block can be used many times, with the same or different compiled
|
||||
|
@ -2621,7 +2640,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
|
|||
<i>startoffset</i>. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
|
||||
<i>length</i> is zero, the subject is assumed to be an empty string. If
|
||||
<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
If <i>startoffset</i> is greater than the length of the subject,
|
||||
|
@ -2643,10 +2664,10 @@ lookbehind. For example, consider the pattern
|
|||
</pre>
|
||||
which finds occurrences of "iss" in the middle of words. (\B matches only if
|
||||
the current position in the subject is not a word boundary.) When applied to
|
||||
the string "Mississipi" the first call to <b>pcre2_match()</b> finds the first
|
||||
the string "Mississippi" the first call to <b>pcre2_match()</b> finds the first
|
||||
occurrence. If <b>pcre2_match()</b> is called again with just the remainder of
|
||||
the subject, namely "issipi", it does not match, because \B is always false at
|
||||
the start of the subject, which is deemed to be a word boundary. However, if
|
||||
the subject, namely "issippi", it does not match, because \B is always false
|
||||
at the start of the subject, which is deemed to be a word boundary. However, if
|
||||
<b>pcre2_match()</b> is passed the entire string again, but with
|
||||
<i>startoffset</i> set to 4, it finds the second occurrence of "iss" because it
|
||||
is able to look behind the starting point to discover that it is preceded by a
|
||||
|
@ -3125,11 +3146,11 @@ The backtracking match limit was reached.
|
|||
<pre>
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
</pre>
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL
|
||||
</pre>
|
||||
|
@ -3375,12 +3396,17 @@ same number causes an error at compile time.
|
|||
<P>
|
||||
This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
|
||||
subject string in <i>outputbuffer</i>, replacing parts that were matched with
|
||||
the <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
||||
option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
||||
replacement string(s). The default action is to perform just one replacement if
|
||||
the pattern matches, but there is an option that requests multiple replacements
|
||||
(see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
the <i>replacement</i> string, whose length is supplied in <i>rlength</i>, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
|
||||
replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
|
||||
error occurs if <i>replacement</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
</P>
|
||||
<P>
|
||||
If successful, <b>pcre2_substitute()</b> returns the number of substitutions
|
||||
|
@ -3414,12 +3440,12 @@ block may or may not have been changed.
|
|||
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
<i>match_data</i> block must be provided, and it must have been used for an
|
||||
external call to <b>pcre2_match()</b>. The data in the <i>match_data</i> block
|
||||
(return code, offset vector) is used for the first substitution instead of
|
||||
calling <b>pcre2_match()</b> from within <b>pcre2_substitute()</b>. This allows
|
||||
an application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
<i>match_data</i> block must be provided, and it must have already been used for
|
||||
an external call to <b>pcre2_match()</b> with the same pattern and subject
|
||||
arguments. The data in the <i>match_data</i> block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling <b>pcre2_match()</b>
|
||||
from within <b>pcre2_substitute()</b>. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
</P>
|
||||
<P>
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
|
@ -3564,7 +3590,7 @@ and force lower case. The escape sequences change the current state: \U and
|
|||
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
||||
\u and \l force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \Q...\E quoted sequences. If either
|
||||
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
||||
properties are used for case forcing characters whose code points are greater
|
||||
|
@ -3636,7 +3662,9 @@ default.
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
<i>match_data</i> argument is NULL.
|
||||
<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
|
@ -3791,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
<P>
|
||||
The function <b>pcre2_dfa_match()</b> is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
<b>pcre2_dfa_match()</b> does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
|
||||
not support, see the
|
||||
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -3831,7 +3860,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
|
|||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
Option bits for <b>pcre_dfa_match()</b>
|
||||
Option bits for <b>pcre2_dfa_match()</b>
|
||||
</b><br>
|
||||
<P>
|
||||
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
||||
|
@ -3982,16 +4011,16 @@ fail, this error is given.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 04 November 2020
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \P, \p,
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
|
||||
supported. Details are given in the
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
<pre>
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
</pre>
|
||||
to the <b>configure</b> command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -553,15 +553,16 @@ documentation.
|
|||
<P>
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
<pre>
|
||||
--disable-percent-zt
|
||||
</pre>
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
a suitable format is used depending on the size of long for the platform.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
|
||||
<P>
|
||||
|
@ -607,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 March 2020
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -18,33 +18,41 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
<P>
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
</P>
|
||||
<P>
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
</P>
|
||||
<P>
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \b* (but not \b{3}, though oddly it does allow ^{3}), but these
|
||||
do not seem to have any use. PCRE2 does not allow any kind of quantifier on
|
||||
non-lookaround assertions.
|
||||
for example, \b*, but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
</P>
|
||||
<P>
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
</P>
|
||||
<P>
|
||||
4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
\U, and \N when followed by a character name. \N on its own, matching a
|
||||
non-newline character, and \N{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -55,26 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
|
|||
interprets them.
|
||||
</P>
|
||||
<P>
|
||||
5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \p and \P are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
|
||||
is limited. See the
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
</P>
|
||||
<P>
|
||||
6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \Q and \E which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \Q and \E just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \Q
|
||||
and \E which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \Q and \E just like any other character. Note the
|
||||
following examples:
|
||||
<pre>
|
||||
Pattern PCRE2 matches Perl matches
|
||||
|
||||
|
@ -88,19 +96,19 @@ The \Q...\E sequence is recognized both inside and outside character classes
|
|||
by both PCRE2 and Perl.
|
||||
</P>
|
||||
<P>
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
</P>
|
||||
<P>
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
|
@ -109,20 +117,20 @@ the group does not contain any | characters. Note that such groups are
|
|||
processed as anchored at the point where they are tested.
|
||||
</P>
|
||||
<P>
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
</P>
|
||||
<P>
|
||||
11. There are some differences that are concerned with the settings of captured
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
"b".
|
||||
</P>
|
||||
<P>
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact that PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
|
@ -132,40 +140,43 @@ to distinguish which group matched, because both names map to capture group
|
|||
number 1. To avoid this confusing situation, an error is given at compile time.
|
||||
</P>
|
||||
<P>
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a group. If the /x modifier is
|
||||
set, Perl allowed white space between ( and ? though the latest Perls give an
|
||||
error (for a while it was just deprecated). There may still be some cases where
|
||||
Perl behaves differently.
|
||||
</P>
|
||||
<P>
|
||||
14. Perl, when in warning mode, gives warnings for character classes such as
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
[A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||
warning features, so it gives an error in these cases because they are almost
|
||||
certainly user mistakes.
|
||||
</P>
|
||||
<P>
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \p{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.32), \p{Lu} and \p{Ll} match all
|
||||
in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
</P>
|
||||
<P>
|
||||
16. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
assertions. In PCRE2, \K is acted on when it occurs in positive assertions,
|
||||
but is ignored in negative assertions.
|
||||
17. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\K is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
</P>
|
||||
<P>
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.32:
|
||||
list is with respect to Perl 5.34:
|
||||
<br>
|
||||
<br>
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same length.
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
<br>
|
||||
<br>
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
|
@ -219,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible
|
|||
lookarounds are atomic.
|
||||
</P>
|
||||
<P>
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
</P>
|
||||
<P>
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
<a href="pcre2limit.html"><b>pcre2limit</b></a>
|
||||
documentation for details. In release 5.10, Perl switched from recursion to iteration
|
||||
keeping the intermediate matches on the heap, which is ~10% slower but does not
|
||||
|
@ -237,7 +248,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -246,9 +257,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 06 October 2020
|
||||
Last updated: 08 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -141,8 +141,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">CONVERTING POSIX PATTERNS</a><br>
|
||||
<P>
|
||||
|
|
|
@ -215,8 +215,8 @@ if (rc < 0)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded. Get a pointer to the output vector, where string offsets are
|
||||
stored. */
|
||||
/* Match succeeded. Get a pointer to the output vector, where string offsets
|
||||
are stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
@ -234,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
|
|||
if (rc == 0)
|
||||
printf("ovector was not big enough for all the captured substrings\n");
|
||||
|
||||
/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
|
||||
to set the start of a match later than its end. In this demonstration program,
|
||||
we just detect this case and give up. */
|
||||
/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
|
||||
assertions. However, there is an option to re-enable the old behaviour. If that
|
||||
is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
|
||||
assertion to set the start of a match later than its end. In this demonstration
|
||||
program, we show how to detect this case, but it shouldn't arise because the
|
||||
option is never set. */
|
||||
|
||||
if (ovector[0] > ovector[1])
|
||||
{
|
||||
|
@ -453,7 +456,7 @@ for (;;)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
/* Match succeeded */
|
||||
|
||||
printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -188,14 +192,21 @@ Treat binary files as text. This is equivalent to
|
|||
<b>--binary-files</b>=<i>text</i>.
|
||||
</P>
|
||||
<P>
|
||||
<b>--allow-lookaround-bsk</b>
|
||||
PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl.
|
||||
This option causes <b>pcre2grep</b> to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
</P>
|
||||
<P>
|
||||
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
||||
Output up to <i>number</i> lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -405,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -475,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
|
@ -586,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
|
|||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
|
@ -833,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -1040,16 +1059,16 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 04 October 2020
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -54,6 +54,7 @@ platforms:
|
|||
<pre>
|
||||
ARM 32-bit (v5, v7, and Thumb2)
|
||||
ARM 64-bit
|
||||
IBM s390x 64-bit
|
||||
Intel x86 32-bit and 64-bit
|
||||
MIPS 32-bit and 64-bit
|
||||
Power PC 32-bit and 64-bit
|
||||
|
@ -268,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
|
|||
for currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
</P>
|
||||
<P>
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
|
@ -286,7 +287,7 @@ inefficient solution, and not recommended.
|
|||
This is a suggestion for how a multithreaded program that needs to set up
|
||||
non-default JIT stacks might operate:
|
||||
<pre>
|
||||
During thread initalization
|
||||
During thread initialization
|
||||
thread_local_var = pcre2_jit_stack_create(...)
|
||||
|
||||
During thread exit
|
||||
|
@ -381,8 +382,8 @@ out this complicated API.
|
|||
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -441,10 +442,10 @@ that was not compiled.
|
|||
<P>
|
||||
When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
</P>
|
||||
<P>
|
||||
Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
|
||||
|
@ -465,9 +466,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 30 November 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -78,8 +78,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
|
|||
If a leaf node is reached, a matching string has been found, and at that point
|
||||
the algorithm stops. Thus, if there is more than one possible match, this
|
||||
algorithm returns the first one that it finds. Whether this is the shortest,
|
||||
the longest, or some intermediate length depends on the way the greedy and
|
||||
ungreedy repetition quantifiers are specified in the pattern.
|
||||
the longest, or some intermediate length depends on the way the alternations
|
||||
and the greedy or ungreedy repetition quantifiers are specified in the
|
||||
pattern.
|
||||
</P>
|
||||
<P>
|
||||
Because it ends up with a single path through the tree, it is relatively
|
||||
|
@ -109,11 +110,17 @@ no more unterminated paths. At this point, terminated paths represent the
|
|||
different matching possibilities (if there are none, the match has failed).
|
||||
Thus, if there is more than one possible match, this algorithm finds all of
|
||||
them, and in particular, it finds the longest. The matches are returned in
|
||||
decreasing order of length. There is an option to stop the algorithm after the
|
||||
first match (which is necessarily the shortest) is found.
|
||||
the output vector in decreasing order of length. There is an option to stop the
|
||||
algorithm after the first match (which is necessarily the shortest) is found.
|
||||
</P>
|
||||
<P>
|
||||
Note that all the matches that are found start at the same point in the
|
||||
Note that the size of the vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
|
||||
data block is therefore not advisable when doing DFA matching.
|
||||
</P>
|
||||
<P>
|
||||
Note also that all the matches that are found start at the same point in the
|
||||
subject. If the pattern
|
||||
<pre>
|
||||
cat(er(pillar)?)?
|
||||
|
@ -194,21 +201,14 @@ supported by <b>pcre2_dfa_match()</b>.
|
|||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
|
||||
<P>
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
The main advantage of the alternative algorithm is that all possible matches
|
||||
(at a single point in the subject) are automatically found, and in particular,
|
||||
the longest match is found. To find more than one match at the same point using
|
||||
the standard algorithm, you have to do kludgy things with callouts.
|
||||
</P>
|
||||
<P>
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
match using the standard algorithm, you have to do kludgy things with
|
||||
callouts.
|
||||
</P>
|
||||
<P>
|
||||
2. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack (except for lookbehinds), it is possible to pass very
|
||||
long subject strings to the matching function in several pieces, checking for
|
||||
partial matching each time. Although it is also possible to do multi-segment
|
||||
matching using the standard algorithm, by retaining partially matched
|
||||
substrings, it is more complicated. The
|
||||
Partial matching is possible with this algorithm, though it has some
|
||||
limitations. The
|
||||
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
||||
documentation gives details of partial matching and discusses multi-segment
|
||||
matching.
|
||||
|
@ -230,20 +230,23 @@ invalid UTF string are not supported.
|
|||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
</P>
|
||||
<P>
|
||||
4. JIT optimization is not supported.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 28 August 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -534,7 +534,7 @@ for themselves. For example, outside a character class:
|
|||
\0113 is a tab followed by the character "3"
|
||||
\113 might be a backreference, otherwise the character with octal code 113
|
||||
\377 might be a backreference, otherwise the value 255 (decimal)
|
||||
\81 is always a backreference .sp
|
||||
\81 is always a backreference
|
||||
</pre>
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
must not be introduced by a leading zero, because no more than three octal
|
||||
|
@ -745,7 +745,7 @@ Unicode support is not needed for these characters to be recognized.
|
|||
<P>
|
||||
It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
|
||||
complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
|
||||
at compile time. (BSR is an abbrevation for "backslash R".) This can be made
|
||||
at compile time. (BSR is an abbreviation for "backslash R".) This can be made
|
||||
the default when PCRE2 is built; if this is the case, the other behaviour can
|
||||
be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
|
||||
these settings by starting a pattern string with one of the following
|
||||
|
@ -776,194 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
</P>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
</P>
|
||||
<P>
|
||||
The extra escape sequences that provide property support are:
|
||||
<pre>
|
||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The property names represented by <i>xx</i> above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
<a href="#extraprops">next section).</a>
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \P{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
The property names represented by <i>xx</i> above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
<a href="#extraprops">below).</a>
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \P{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
</P>
|
||||
<br><b>
|
||||
Script properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\p{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and an equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
</P>
|
||||
<P>
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
<pre>
|
||||
\p{Greek}
|
||||
\P{Han}
|
||||
</pre>
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The general category property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
|
@ -1025,9 +893,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
</pre>
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
</P>
|
||||
<P>
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
|
@ -1054,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
|
|||
example, \p{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
</P>
|
||||
<br><b>
|
||||
Binary (yes/no) properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The Bidi_Class property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
</pre>
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
</P>
|
||||
<br><b>
|
||||
Extended grapheme clusters
|
||||
|
@ -1090,7 +1000,7 @@ additional characters according to the following rules for ending a cluster:
|
|||
3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
|
||||
are of five types: L, V, T, LV, and LVT. An L character may be followed by an
|
||||
L, V, LV, or LVT character; an LV or V character may be followed by a V or T
|
||||
character; an LVT or T character may be follwed only by a T character.
|
||||
character; an LVT or T character may be followed only by a T character.
|
||||
</P>
|
||||
<P>
|
||||
4. Do not end before extending characters or spacing marks or the "zero-width
|
||||
|
@ -1175,9 +1085,11 @@ For example, when the pattern
|
|||
matches "foobar", the first substring is still set to "foo".
|
||||
</P>
|
||||
<P>
|
||||
Perl used to document that the use of \K within lookaround assertions is "not
|
||||
well defined", but from version 5.32.0 Perl does not support this usage at all.
|
||||
In PCRE2, \K is acted upon when it occurs inside positive assertions, but is
|
||||
From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From
|
||||
release 10.38 PCRE2 also forbids this by default. However, the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
|
||||
<b>pcre2_compile()</b> to re-enable the previous behaviour. When this option is
|
||||
set, \K is acted upon when it occurs inside positive assertions, but is
|
||||
ignored in negative assertions. Note that when a pattern such as (?=ab\K)
|
||||
matches, the reported start of the match can be greater than the end of the
|
||||
match. Using \K in a lookbehind assertion at the start of a pattern can also
|
||||
|
@ -1334,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
<P>
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
<a href="#newlines">"Newline conventions"</a>
|
||||
above).
|
||||
</P>
|
||||
<P>
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
|
||||
of CR or LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
</P>
|
||||
<P>
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
|
@ -2173,10 +2087,10 @@ be easier to remember:
|
|||
<pre>
|
||||
(*atomic:\d+)foo
|
||||
</pre>
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
</P>
|
||||
<P>
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
|
@ -2897,7 +2811,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
||||
\b (?&byte) (\.(?&byte)){3} \b
|
||||
</pre>
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -3607,7 +3521,7 @@ successful match if there is a later mismatch. Consider:
|
|||
</pre>
|
||||
If the subject is "aaaac...", after the first match attempt fails (starting at
|
||||
the first character in the string), the starting point skips on to start the
|
||||
next attempt at "c". Note that a possessive quantifer does not have the same
|
||||
next attempt at "c". Note that a possessive quantifier does not have the same
|
||||
effect as this example; although it would suppress backtracking during the
|
||||
first match attempt, the second attempt would start at the second character
|
||||
instead of skipping on to "c".
|
||||
|
@ -3845,16 +3759,16 @@ there is a backtrack at the outer level.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 06 October 2020
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||
<P>
|
||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
||||
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
<pre>
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
</pre>
|
||||
|
@ -154,7 +154,6 @@ management functions for the decoded patterns. If this argument is NULL,
|
|||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
<pre>
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
|
||||
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
|
||||
<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
|
||||
<li><a name="TOC13" href="#SEC13">CAPTURING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC15" href="#SEC15">COMMENT</a>
|
||||
<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
|
||||
<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC20" href="#SEC20">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
|
||||
<li><a name="TOC22" href="#SEC22">BACKREFERENCES</a>
|
||||
<li><a name="TOC23" href="#SEC23">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC24" href="#SEC24">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC25" href="#SEC25">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC26" href="#SEC26">CALLOUTS</a>
|
||||
<li><a name="TOC27" href="#SEC27">SEE ALSO</a>
|
||||
<li><a name="TOC28" href="#SEC28">AUTHOR</a>
|
||||
<li><a name="TOC29" href="#SEC29">REVISION</a>
|
||||
<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
|
||||
<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
|
||||
<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
|
||||
<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
|
||||
<li><a name="TOC15" href="#SEC15">CAPTURING</a>
|
||||
<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMMENT</a>
|
||||
<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
|
||||
<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
|
||||
<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
|
||||
<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
|
||||
<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
|
||||
<li><a name="TOC30" href="#SEC30">AUTHOR</a>
|
||||
<li><a name="TOC31" href="#SEC31">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
|
||||
<P>
|
||||
|
@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
|
|||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
</P>
|
||||
<P>
|
||||
Property descriptions in \p and \P are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
|
@ -152,6 +159,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
|
@ -198,166 +206,58 @@ characters.
|
|||
Perl and POSIX space are now the same. Perl added VT to its space character set
|
||||
at release 5.18.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||
<P>
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
[...] positive character class
|
||||
|
@ -385,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
|
|||
but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
||||
\Q...\E inside a character class.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
? 0 or 1, greedy
|
||||
|
@ -406,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
{n,}? n or more, lazy
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\b word boundary
|
||||
|
@ -424,20 +324,23 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
\G first matching position in subject
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\K set reported start of match
|
||||
</pre>
|
||||
From release 10.38 \K is not permitted by default in lookaround assertions,
|
||||
for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\K is honoured in positive assertions, but ignored in negative ones.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
expr|expr|expr...
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(...) capture group
|
||||
|
@ -452,20 +355,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
|
|||
in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
|
||||
both cases, a name must not start with a digit.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?>...) atomic non-capture group
|
||||
(*atomic:...) atomic non-capture group
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?#....) comment (not nestable)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
|
||||
<P>
|
||||
Changes of these options within a group are automatically cancelled at the end
|
||||
of the group.
|
||||
|
@ -510,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
|
|||
application can lock out the use of (*UTF) and (*UCP) by setting the
|
||||
PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
settings with a similar syntax.
|
||||
|
@ -523,7 +426,7 @@ settings with a similar syntax.
|
|||
(*NUL) the NUL character (binary zero)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
setting with a similar syntax.
|
||||
|
@ -532,7 +435,7 @@ setting with a similar syntax.
|
|||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?=...) )
|
||||
|
@ -553,7 +456,7 @@ setting with a similar syntax.
|
|||
</pre>
|
||||
Each top-level branch of a lookbehind must be of a fixed length.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<P>
|
||||
These assertions are specific to PCRE2 and are not Perl-compatible.
|
||||
<pre>
|
||||
|
@ -566,7 +469,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*non_atomic_positive_lookbehind:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(*script_run:...) ) script run, can be backtracked into
|
||||
|
@ -576,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*asr:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\n reference by number (can be ambiguous)
|
||||
|
@ -593,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(?P=name) reference by name (Python)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?R) recurse whole pattern
|
||||
|
@ -612,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
\g'-n' call subroutine by relative number (PCRE2 extension)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?(condition)yes-pattern)
|
||||
|
@ -635,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
|||
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||
condition if the relevant named group exists.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
|
||||
name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
|
||||
|
@ -662,7 +565,7 @@ pattern is not anchored.
|
|||
The effect of one of these verbs in a group called as a subroutine is confined
|
||||
to the subroutine call.
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">CALLOUTS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout (assumed number 0)
|
||||
|
@ -673,25 +576,25 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
|||
start and the end), and the starting delimiter { matched with the ending
|
||||
delimiter }. To encode the ending delimiter within the string, double it.
|
||||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC28" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC29" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 28 December 2019
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -59,12 +59,7 @@ patterns, and the subject lines specify PCRE2 function options, control how the
|
|||
subject is processed, and what output is produced.
|
||||
</P>
|
||||
<P>
|
||||
As the original fairly simple PCRE library evolved, it acquired many different
|
||||
features, and as a result, the original <b>pcretest</b> program ended up with a
|
||||
lot of options in a messy, arcane syntax for testing all the features. The
|
||||
move to the new PCRE2 API provided an opportunity to re-implement the test
|
||||
program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there
|
||||
are still many obscure modifiers, some of which are specifically designed for
|
||||
There are many obscure modifiers, some of which are specifically designed for
|
||||
use in conjunction with the test script and data files that are distributed as
|
||||
part of PCRE2. All the modifiers are documented here, some without much
|
||||
justification, but many of them are unlikely to be of use except when testing
|
||||
|
@ -83,16 +78,16 @@ to 8-bit code units for output.
|
|||
</P>
|
||||
<P>
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, <b>pcre_compile()</b>. The actual
|
||||
are given in generic form, for example, <b>pcre2_compile()</b>. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
<a name="inputencoding"></a></P>
|
||||
<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
|
||||
<P>
|
||||
Input to <b>pcre2test</b> is processed line by line, either by calling the C
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> library. In some
|
||||
Windows environments character 26 (hex 1A) causes an immediate end of file, and
|
||||
no further data is read, so this character should be avoided unless you really
|
||||
want that action.
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> or <b>libedit</b>
|
||||
library. In some Windows environments character 26 (hex 1A) causes an immediate
|
||||
end of file, and no further data is read, so this character should be avoided
|
||||
unless you really want that action.
|
||||
</P>
|
||||
<P>
|
||||
The input is processed using C's string functions, so must not
|
||||
|
@ -258,7 +253,19 @@ available, and the use of JIT for matching is verified.
|
|||
<b>-LM</b>
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LP</b>
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LS</b>
|
||||
List scripts: write a list of recognized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-pattern</b> <i>modifier-list</i>
|
||||
|
@ -486,15 +493,17 @@ excluding pattern meta-characters):
|
|||
</pre>
|
||||
This is interpreted as the pattern's delimiter. A regular expression may be
|
||||
continued over several input lines, in which case the newline characters are
|
||||
included within it. It is possible to include the delimiter within the pattern
|
||||
by escaping it with a backslash, for example
|
||||
included within it. It is possible to include the delimiter as a literal within
|
||||
the pattern by escaping it with a backslash, for example
|
||||
<pre>
|
||||
/abc\/def/
|
||||
</pre>
|
||||
If you do this, the escape and the delimiter form part of the pattern, but
|
||||
since the delimiters are all non-alphanumeric, this does not affect its
|
||||
interpretation. If the terminating delimiter is immediately followed by a
|
||||
backslash, for example,
|
||||
since the delimiters are all non-alphanumeric, the inclusion of the backslash
|
||||
does not affect the pattern's interpretation. Note, however, that this trick
|
||||
does not work within \Q...\E literal bracketing because the backslash will
|
||||
itself be interpreted as a literal. If the terminating delimiter is immediately
|
||||
followed by a backslash, for example,
|
||||
<pre>
|
||||
/abc/\
|
||||
</pre>
|
||||
|
@ -512,11 +521,11 @@ A pattern can be followed by a modifier list (details below).
|
|||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br>
|
||||
<P>
|
||||
Before each subject line is passed to <b>pcre2_match()</b> or
|
||||
<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes, unless the <b>subject_literal</b>
|
||||
modifier was set for the pattern. The following provide a means of encoding
|
||||
non-printing characters in a visible way:
|
||||
Before each subject line is passed to <b>pcre2_match()</b>,
|
||||
<b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>, leading and trailing white
|
||||
space is removed, and the line is scanned for backslash escapes, unless the
|
||||
<b>subject_literal</b> modifier was set for the pattern. The following provide a
|
||||
means of encoding non-printing characters in a visible way:
|
||||
<pre>
|
||||
\a alarm (BEL, \x07)
|
||||
\b backspace (\x08)
|
||||
|
@ -613,6 +622,7 @@ way <b>pcre2_compile()</b> behaves. See
|
|||
for a description of the effects of these options.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
|
@ -1231,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1241,6 +1252,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1552,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1562,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1591,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1611,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1670,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
|
@ -1678,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
<b>null_replacement</b> modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
|
@ -2117,16 +2136,16 @@ on the stack.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 28 April 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
|
|||
<P>
|
||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||
\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
and
|
||||
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -477,7 +478,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -486,9 +487,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 23 February 2020
|
||||
Last updated: 22 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
11
doc/pcre2.3
11
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "28 April 2021" "PCRE2 10.37"
|
||||
.TH PCRE2 3 "27 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -11,7 +11,8 @@ nearly two decades, the limitations of the original API were making development
|
|||
increasingly difficult. The new API is more extensible, and it was simplified
|
||||
by abolishing the separate "study" optimizing function; in PCRE2, patterns are
|
||||
automatically optimized where possible. Since forking from PCRE1, the code has
|
||||
been extensively refactored and new features introduced.
|
||||
been extensively refactored and new features introduced. The old library is now
|
||||
obsolete and is no longer maintained.
|
||||
.P
|
||||
As well as Perl-style regular expression patterns, some features that appeared
|
||||
in Python and the original PCRE before they appeared in Perl are available
|
||||
|
@ -190,18 +191,18 @@ function, listing its arguments and results.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.P
|
||||
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||
use my two names separated by a dot at gmail.com.
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 28 April 2021
|
||||
Last updated: 27 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
2565
doc/pcre2.txt
2565
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -80,8 +80,17 @@ Additional options may be set in the compile context via the
|
|||
.\"
|
||||
function.
|
||||
.P
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the \fIerrorcode\fP argument to the
|
||||
\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
|
||||
error was encountered is returned via the \fIerroroffset\fP argument.
|
||||
.P
|
||||
If there is no error, the value passed via \fIerrorcode\fP returns the message
|
||||
"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
|
||||
via \fIerroroffset\fP is zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
each option, in the
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_DFA_MATCH 3 "16 October 2018" "PCRE2 10.33"
|
||||
.TH PCRE2_DFA_MATCH 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -33,10 +33,15 @@ just once (except when processing lookaround assertions). This function is
|
|||
\fIworkspace\fP Points to a vector of ints used as working space
|
||||
\fIwscount\fP Number of elements in the vector
|
||||
.sp
|
||||
For \fBpcre2_dfa_match()\fP, a match context is needed only if you want to set
|
||||
up a callout function or specify the heap limit or the match or the recursion
|
||||
depth limits. The \fIlength\fP and \fIstartoffset\fP values are code units, not
|
||||
characters. The options are:
|
||||
The size of output vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
|
||||
data block is therefore not advisable when using this function.
|
||||
.P
|
||||
A match context is needed only if you want to set up a callout function or
|
||||
specify the heap limit or the match or the recursion depth limits. The
|
||||
\fIlength\fP and \fIstartoffset\fP values are code units, not characters. The
|
||||
options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_COPY_MATCHED_SUBJECT
|
||||
|
|
|
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
\fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
|
||||
which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
.\" HREF
|
||||
\fBpcre2jit\fP
|
||||
.\"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_MATCH_DATA_CREATE 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2_MATCH_DATA_CREATE 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -18,8 +18,9 @@ This function creates a new match data block, which is used for holding the
|
|||
result of a match. The first argument specifies the number of pairs of offsets
|
||||
that are required. These form the "output vector" (ovector) within the match
|
||||
data block, and are used to identify the matched string and any captured
|
||||
substrings. There is always one pair of offsets; if \fBovecsize\fP is zero, it
|
||||
is treated as one.
|
||||
substrings when matching with \fBpcre2_match()\fP, or a number of different
|
||||
matches at the same point when used with \fBpcre2_dfa_match()\fP. There is
|
||||
always one pair of offsets; if \fBovecsize\fP is zero, it is treated as one.
|
||||
.P
|
||||
The second argument points to a general context, for custom memory management,
|
||||
or is NULL for system memory management. The result of the function is NULL if
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -14,12 +14,15 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function creates a new match data block, which is used for holding the
|
||||
result of a match. The first argument points to a compiled pattern. The number
|
||||
of capturing parentheses within the pattern is used to compute the number of
|
||||
pairs of offsets that are required in the match data block. These form the
|
||||
"output vector" (ovector) within the match data block, and are used to identify
|
||||
the matched string and any captured substrings.
|
||||
This function creates a new match data block for holding the result of a match.
|
||||
The first argument points to a compiled pattern. The number of capturing
|
||||
parentheses within the pattern is used to compute the number of pairs of
|
||||
offsets that are required in the match data block. These form the "output
|
||||
vector" (ovector) within the match data block, and are used to identify the
|
||||
matched string and any captured substrings when matching with
|
||||
\fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the
|
||||
output vector in a different way, you should use \fBpcre2_match_data_create()\fP
|
||||
instead of this function.
|
||||
.P
|
||||
The second argument points to a general context, for custom memory management,
|
||||
or is NULL to use the same memory allocator as was used for the compiled
|
||||
|
|
|
@ -36,7 +36,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL \fIcodes\fP or \fIbytes\fP is NULL
|
||||
.sp
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "11 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "31 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -18,12 +18,13 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{df800} to \ex{dfff}
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and \ex
|
||||
handling
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and
|
||||
\ex handling
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as
|
||||
a literal following character
|
||||
|
|
|
@ -55,32 +55,42 @@ automatically added.
|
|||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NOTBOL Subject is not the beginning of a
|
||||
line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_NOTEMPTY An empty string is not a
|
||||
valid match
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of
|
||||
the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in
|
||||
the subject or replacement
|
||||
.\" JOIN
|
||||
(only relevant if PCRE2_UTF was
|
||||
set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the
|
||||
subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for
|
||||
first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
.sp
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
.P
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-zero; its
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
|
||||
contents must be the result of a call to \fBpcre2_match()\fP using the same
|
||||
pattern and subject.
|
||||
.P
|
||||
|
|
173
doc/pcre2api.3
173
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "04 November 2020" "PCRE2 10.36"
|
||||
.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -953,7 +953,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
.P
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
pattern of the form
|
||||
|
@ -964,18 +964,18 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
|||
less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
|
||||
limit is set, less than the default.
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The \fBpcre2_match()\fP function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
\fBpcre2_match()\fP uses the heap are given in the
|
||||
.\" HREF
|
||||
\fBpcre2perform\fP
|
||||
.\"
|
||||
documentation.
|
||||
.P
|
||||
Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
|
||||
|
@ -1019,10 +1019,10 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
|
|||
.fi
|
||||
.sp
|
||||
This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1323,8 +1323,7 @@ If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and \fBpcre2_compile()\fP returns a non-NULL value.
|
||||
error has occurred.
|
||||
.P
|
||||
There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
|
||||
if it finds an error in the pattern. There are also some negative error codes
|
||||
|
@ -1343,14 +1342,17 @@ message"
|
|||
below)
|
||||
.\"
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in \fBpcre2.h\fP.
|
||||
for both positive and negative error codes in \fBpcre2.h\fP. When compilation
|
||||
is successful \fIerrorcode\fP is set to a value that returns the message "no
|
||||
error" if passed to \fBpcre2_get_error_message()\fP.
|
||||
.P
|
||||
The value returned in \fIerroroffset\fP is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
.P
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
cases, the offset passed back is the length of the pattern. Note that the
|
||||
|
@ -1794,7 +1796,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is
|
|||
undefined. It may cause your program to crash or loop.
|
||||
.P
|
||||
Note that this option can also be passed to \fBpcre2_match()\fP and
|
||||
\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
.P
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
|
@ -1875,6 +1877,13 @@ characters with code points greater than 127.
|
|||
.sp
|
||||
The option bits that can be set in a compile context by calling the
|
||||
\fBpcre2_set_compile_extra_options()\fP function are as follows:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
.sp
|
||||
Since release 10.38 PCRE2 has forbidden the use of \eK within lookaround
|
||||
assertions, following Perl's lead. This option is provided to re-enable the
|
||||
previous behaviour (act in positive lookarounds, ignore in negative ones) in
|
||||
case anybody is relying on it.
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
.sp
|
||||
|
@ -2008,8 +2017,8 @@ point. However, this applies only to characters whose code points are less than
|
|||
256. By default, higher-valued code points never match escapes such as \ew or
|
||||
\ed.
|
||||
.P
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode properties
|
||||
of all characters can be tested with \ep and \eP, or, alternatively, the
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \ep and \eP, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
|
@ -2272,7 +2281,7 @@ return zero. The third argument should point to a \fBsize_t\fP variable.
|
|||
PCRE2_INFO_LASTCODETYPE
|
||||
.sp
|
||||
Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
\fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
|
@ -2490,19 +2499,27 @@ to an abstract format like Java or .NET serialization.
|
|||
Information about a successful or unsuccessful match is placed in a match
|
||||
data block, which is an opaque structure that is accessed by function calls. In
|
||||
particular, the match data block contains a vector of offsets into the subject
|
||||
string that define the matched part of the subject and any substrings that were
|
||||
captured. This is known as the \fIovector\fP.
|
||||
string that define the matched parts of the subject. This is known as the
|
||||
\fIovector\fP.
|
||||
.P
|
||||
Before calling \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or
|
||||
\fBpcre2_jit_match()\fP you must create a match data block by calling one of
|
||||
the creation functions above. For \fBpcre2_match_data_create()\fP, the first
|
||||
argument is the number of pairs of offsets in the \fIovector\fP. One pair of
|
||||
offsets is required to identify the string that matched the whole pattern, with
|
||||
an additional pair for each captured substring. For example, a value of 4
|
||||
creates enough space to record the matched portion of the subject plus three
|
||||
captured substrings. A minimum of at least 1 pair is imposed by
|
||||
\fBpcre2_match_data_create()\fP, so it is always possible to return the overall
|
||||
matched string.
|
||||
argument is the number of pairs of offsets in the \fIovector\fP.
|
||||
.P
|
||||
When using \fBpcre2_match()\fP, one pair of offsets is required to identify the
|
||||
string that matched the whole pattern, with an additional pair for each
|
||||
captured substring. For example, a value of 4 creates enough space to record
|
||||
the matched portion of the subject plus three captured substrings.
|
||||
.P
|
||||
When using \fBpcre2_dfa_match()\fP there may be multiple matched substrings of
|
||||
different lengths at the same point in the subject. The ovector should be made
|
||||
large enough to hold as many as are expected.
|
||||
.P
|
||||
A minimum of 1 pair is imposed by \fBpcre2_match_data_create()\fP, so
|
||||
it is always possible to return the overall matched string in the case of
|
||||
\fBpcre2_match()\fP or the longest match in the case of
|
||||
\fBpcre2_dfa_match()\fP.
|
||||
.P
|
||||
The second argument of \fBpcre2_match_data_create()\fP is a pointer to a
|
||||
general context, which can specify custom memory management for obtaining the
|
||||
|
@ -2511,10 +2528,11 @@ pass NULL, which causes \fBmalloc()\fP to be used.
|
|||
.P
|
||||
For \fBpcre2_match_data_create_from_pattern()\fP, the first argument is a
|
||||
pointer to a compiled pattern. The ovector is created to be exactly the right
|
||||
size to hold all the substrings a pattern might capture. The second argument is
|
||||
again a pointer to a general context, but in this case if NULL is passed, the
|
||||
memory is obtained using the same allocator that was used for the compiled
|
||||
pattern (custom or default).
|
||||
size to hold all the substrings a pattern might capture when matched using
|
||||
\fBpcre2_match()\fP. You should not use this call when matching with
|
||||
\fBpcre2_dfa_match()\fP. The second argument is again a pointer to a general
|
||||
context, but in this case if NULL is passed, the memory is obtained using the
|
||||
same allocator that was used for the compiled pattern (custom or default).
|
||||
.P
|
||||
A match data block can be used many times, with the same or different compiled
|
||||
patterns. You can extract information from a match data block after a match
|
||||
|
@ -2608,7 +2626,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in
|
|||
\fIstartoffset\fP. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and
|
||||
\fIlength\fP is zero, the subject is assumed to be an empty string. If
|
||||
\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL.
|
||||
.P
|
||||
If \fIstartoffset\fP is greater than the length of the subject,
|
||||
\fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
|
||||
|
@ -2628,10 +2648,10 @@ lookbehind. For example, consider the pattern
|
|||
.sp
|
||||
which finds occurrences of "iss" in the middle of words. (\eB matches only if
|
||||
the current position in the subject is not a word boundary.) When applied to
|
||||
the string "Mississipi" the first call to \fBpcre2_match()\fP finds the first
|
||||
the string "Mississippi" the first call to \fBpcre2_match()\fP finds the first
|
||||
occurrence. If \fBpcre2_match()\fP is called again with just the remainder of
|
||||
the subject, namely "issipi", it does not match, because \eB is always false at
|
||||
the start of the subject, which is deemed to be a word boundary. However, if
|
||||
the subject, namely "issippi", it does not match, because \eB is always false
|
||||
at the start of the subject, which is deemed to be a word boundary. However, if
|
||||
\fBpcre2_match()\fP is passed the entire string again, but with
|
||||
\fIstartoffset\fP set to 4, it finds the second occurrence of "iss" because it
|
||||
is able to look behind the starting point to discover that it is preceded by a
|
||||
|
@ -3142,11 +3162,11 @@ The backtracking match limit was reached.
|
|||
.sp
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
.sp
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
.sp
|
||||
PCRE2_ERROR_NULL
|
||||
.sp
|
||||
|
@ -3397,12 +3417,16 @@ same number causes an error at compile time.
|
|||
.P
|
||||
This function optionally calls \fBpcre2_match()\fP and then makes a copy of the
|
||||
subject string in \fIoutputbuffer\fP, replacing parts that were matched with
|
||||
the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
||||
option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
||||
replacement string(s). The default action is to perform just one replacement if
|
||||
the pattern matches, but there is an option that requests multiple replacements
|
||||
(see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the
|
||||
replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an
|
||||
error occurs if \fIreplacement\fP is NULL.
|
||||
.P
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
.P
|
||||
If successful, \fBpcre2_substitute()\fP returns the number of substitutions
|
||||
that were carried out. This may be zero if no match was found, and is never
|
||||
|
@ -3431,12 +3455,12 @@ block may or may not have been changed.
|
|||
As well as the usual options for \fBpcre2_match()\fP, a number of additional
|
||||
options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
\fImatch_data\fP block must be provided, and it must have been used for an
|
||||
external call to \fBpcre2_match()\fP. The data in the \fImatch_data\fP block
|
||||
(return code, offset vector) is used for the first substitution instead of
|
||||
calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. This allows
|
||||
an application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
\fImatch_data\fP block must be provided, and it must have already been used for
|
||||
an external call to \fBpcre2_match()\fP with the same pattern and subject
|
||||
arguments. The data in the \fImatch_data\fP block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling \fBpcre2_match()\fP
|
||||
from within \fBpcre2_substitute()\fP. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
.P
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
|
||||
|
@ -3568,7 +3592,7 @@ and force lower case. The escape sequences change the current state: \eU and
|
|||
terminating a \eQ quoted sequence) reverts to no case forcing. The sequences
|
||||
\eu and \el force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \eQ...\eE quoted sequences. If either
|
||||
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
||||
properties are used for case forcing characters whose code points are greater
|
||||
|
@ -3633,7 +3657,9 @@ needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
|||
default.
|
||||
.P
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
\fImatch_data\fP argument is NULL.
|
||||
\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0.
|
||||
.P
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||
|
@ -3795,12 +3821,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
.P
|
||||
The function \fBpcre2_dfa_match()\fP is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
\fBpcre2_dfa_match()\fP does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does
|
||||
not support, see the
|
||||
.\" HREF
|
||||
\fBpcre2matching\fP
|
||||
.\"
|
||||
|
@ -3832,7 +3859,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
|
|||
wspace, /* working space vector */
|
||||
20); /* number of elements (NOT size in bytes) */
|
||||
.
|
||||
.SS "Option bits for \fBpcre_dfa_match()\fP"
|
||||
.SS "Option bits for \fBpcre2_dfa_match()\fP"
|
||||
.rs
|
||||
.sp
|
||||
The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
|
||||
|
@ -3991,7 +4018,7 @@ fail, this error is given.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -4000,6 +4027,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 04 November 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "20 March 2020" "PCRE2 10.35"
|
||||
.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \eP, \ep,
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||
supported. Details are given in the
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
|
|||
\fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The \fBpcre2_match()\fP function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
|
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
.sp
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -563,15 +563,16 @@ documentation.
|
|||
.sp
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
.sp
|
||||
--disable-percent-zt
|
||||
.sp
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
a suitable format is used depending on the size of long for the platform.
|
||||
.
|
||||
.
|
||||
.SH "SUPPORT FOR FUZZERS"
|
||||
|
@ -623,7 +624,7 @@ give a warning.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -632,6 +633,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 March 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2COMPAT 3 "06 October 2020" "PCRE2 10.36"
|
||||
.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
|
||||
|
@ -6,31 +6,38 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
.P
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
.P
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \eb* (but not \eb{3}, though oddly it does allow ^{3}), but these
|
||||
do not seem to have any use. PCRE2 does not allow any kind of quantifier on
|
||||
non-lookaround assertions.
|
||||
for example, \eb*, but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
.P
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
.P
|
||||
4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
\eU, and \eN when followed by a character name. \eN on its own, matching a
|
||||
non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -40,12 +47,12 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
|
|||
PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
|
||||
interprets them.
|
||||
.P
|
||||
5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \ep and \eP are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
|
||||
is limited. See the
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -53,14 +60,14 @@ documentation for details. The long synonyms for property names that Perl
|
|||
supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
.P
|
||||
6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \eQ and \eE which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \eQ
|
||||
and \eE which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \eQ and \eE just like any other character. Note the
|
||||
following examples:
|
||||
.sp
|
||||
Pattern PCRE2 matches Perl matches
|
||||
.sp
|
||||
|
@ -75,7 +82,7 @@ other character. Note the following examples:
|
|||
The \eQ...\eE sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
.P
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
.\" HREF
|
||||
|
@ -83,11 +90,11 @@ external function to be called during pattern matching. See the
|
|||
.\"
|
||||
documentation for details.
|
||||
.P
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
.P
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
|
@ -95,18 +102,18 @@ that is called as a subroutine, its action is limited to that group, even if
|
|||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
.P
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
.P
|
||||
11. There are some differences that are concerned with the settings of captured
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
"b".
|
||||
.P
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact that PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
|
@ -115,35 +122,38 @@ causes an error at compile time. If it were allowed, it would not be possible
|
|||
to distinguish which group matched, because both names map to capture group
|
||||
number 1. To avoid this confusing situation, an error is given at compile time.
|
||||
.P
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a group. If the /x modifier is
|
||||
set, Perl allowed white space between ( and ? though the latest Perls give an
|
||||
error (for a while it was just deprecated). There may still be some cases where
|
||||
Perl behaves differently.
|
||||
.P
|
||||
14. Perl, when in warning mode, gives warnings for character classes such as
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
[A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||
warning features, so it gives an error in these cases because they are almost
|
||||
certainly user mistakes.
|
||||
.P
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \ep{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.32), \ep{Lu} and \ep{Ll} match all
|
||||
in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
.P
|
||||
16. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
assertions. In PCRE2, \eK is acted on when it occurs in positive assertions,
|
||||
but is ignored in negative assertions.
|
||||
17. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\eK is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
.P
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.32:
|
||||
list is with respect to Perl 5.34:
|
||||
.sp
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same length.
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
.sp
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
in lookbehinds, provided that there is no possibility of referencing a
|
||||
|
@ -184,11 +194,11 @@ the pattern.
|
|||
extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
.P
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
.P
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
.\" HREF
|
||||
\fBpcre2limit\fP
|
||||
.\"
|
||||
|
@ -203,7 +213,7 @@ fall into any stack-overflow limit. PCRE2 made a similar change at release
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -212,6 +222,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 06 October 2020
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -116,8 +116,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
(which does match separators) is supported.
|
||||
.P
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
.
|
||||
.
|
||||
.SH "CONVERTING POSIX PATTERNS"
|
||||
|
|
|
@ -215,8 +215,8 @@ if (rc < 0)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded. Get a pointer to the output vector, where string offsets are
|
||||
stored. */
|
||||
/* Match succeeded. Get a pointer to the output vector, where string offsets
|
||||
are stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||
|
@ -234,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
|
|||
if (rc == 0)
|
||||
printf("ovector was not big enough for all the captured substrings\en");
|
||||
|
||||
/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
|
||||
to set the start of a match later than its end. In this demonstration program,
|
||||
we just detect this case and give up. */
|
||||
/* Since release 10.38 PCRE2 has locked out the use of \eK in lookaround
|
||||
assertions. However, there is an option to re-enable the old behaviour. If that
|
||||
is set, it is possible to run patterns such as /(?=.\eK)/ that use \eK in an
|
||||
assertion to set the start of a match later than its end. In this demonstration
|
||||
program, we show how to detect this case, but it shouldn't arise because the
|
||||
option is never set. */
|
||||
|
||||
if (ovector[0] > ovector[1])
|
||||
{
|
||||
|
@ -453,7 +456,7 @@ for (;;)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
/* Match succeeded */
|
||||
|
||||
printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "04 October 2020" "PCRE2 10.36"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -149,22 +151,30 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
\fB--binary-files\fP=\fItext\fP.
|
||||
.TP
|
||||
\fB--allow-lookaround-bsk\fP
|
||||
PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl.
|
||||
This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
.TP
|
||||
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
|
||||
Output up to \fInumber\fP lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -351,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -412,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
|
@ -511,10 +525,7 @@ counter that is incremented each time around its main processing loop. If the
|
|||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
|
@ -727,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -946,7 +963,7 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -955,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 04 October 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -42,13 +42,15 @@ DESCRIPTION
|
|||
|
||||
pcre2grep some-pattern file1 - file3
|
||||
|
||||
Input files are searched line by line. By default, each line that
|
||||
By default, input files are searched line by line. Each line that
|
||||
matches a pattern is copied to the standard output, and if there is
|
||||
more than one file, the file name is output at the start of each line,
|
||||
followed by a colon. However, there are options that can change how
|
||||
pcre2grep behaves. In particular, the -M option makes it possible to
|
||||
pcre2grep behaves. For example, the -M option makes it possible to
|
||||
search for strings that span line boundaries. What defines a line
|
||||
boundary is controlled by the -N (--newline) option.
|
||||
boundary is controlled by the -N (--newline) option. The -h and -H op-
|
||||
tions control whether or not file names are shown, and the -Z option
|
||||
changes the file name terminator to a zero byte.
|
||||
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the --buffer-size and
|
||||
|
@ -149,26 +151,35 @@ OPTIONS
|
|||
the file is reached, or if the processing buffer size has
|
||||
been set too small. If file names and/or line numbers are be-
|
||||
ing output, a hyphen separator is used instead of a colon for
|
||||
the context lines. A line containing "--" is output between
|
||||
each group of lines, unless they are in fact contiguous in
|
||||
the input file. The value of number is expected to be rela-
|
||||
tively small. When -c is used, -A is ignored.
|
||||
the context lines (the -Z option can be used to change the
|
||||
file name terminator to a zero byte). A line containing "--"
|
||||
is output between each group of lines, unless they are in
|
||||
fact contiguous in the input file. The value of number is ex-
|
||||
pected to be relatively small. When -c is used, -A is ig-
|
||||
nored.
|
||||
|
||||
-a, --text
|
||||
Treat binary files as text. This is equivalent to --binary-
|
||||
files=text.
|
||||
|
||||
--allow-lookaround-bsk
|
||||
PCRE2 now forbids the use of \K in lookarounds by default, in
|
||||
line with Perl. This option causes pcre2grep to set the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which enables this
|
||||
somewhat dangerous usage.
|
||||
|
||||
-B number, --before-context=number
|
||||
Output up to number lines of context before each matching
|
||||
line. Fewer lines are output if the previous match or the
|
||||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
Output up to number lines of context before each matching
|
||||
line. Fewer lines are output if the previous match or the
|
||||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
line numbers are being output, a hyphen separator is used in-
|
||||
stead of a colon for the context lines. A line containing
|
||||
"--" is output between each group of lines, unless they are
|
||||
in fact contiguous in the input file. The value of number is
|
||||
expected to be relatively small. When -c is used, -B is ig-
|
||||
nored.
|
||||
stead of a colon for the context lines (the -Z option can be
|
||||
used to change the file name terminator to a zero byte). A
|
||||
line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The
|
||||
value of number is expected to be relatively small. When -c
|
||||
is used, -B is ignored.
|
||||
|
||||
--binary-files=word
|
||||
Specify how binary files are to be processed. If the word is
|
||||
|
@ -381,89 +392,94 @@ OPTIONS
|
|||
|
||||
-H, --with-filename
|
||||
Force the inclusion of the file name at the start of output
|
||||
lines when searching a single file. By default, the file name
|
||||
is not shown in this case. For matching lines, the file name
|
||||
is followed by a colon; for context lines, a hyphen separator
|
||||
is used. If a line number is also being output, it follows
|
||||
the file name. When the -M option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file
|
||||
name. This option overrides any previous -h, -l, or -L op-
|
||||
tions.
|
||||
lines when searching a single file. The file name is not nor-
|
||||
mally shown in this case. By default, for matching lines,
|
||||
the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. The -Z option can be used to change
|
||||
the terminator to a zero byte. If a line number is also being
|
||||
output, it follows the file name. When the -M option causes a
|
||||
pattern to match more than one line, only the first is pre-
|
||||
ceded by the file name. This option overrides any previous
|
||||
-h, -l, or -L options.
|
||||
|
||||
-h, --no-filename
|
||||
Suppress the output of file names when searching multiple files.
|
||||
By default, file names are shown when multiple files are
|
||||
searched. For matching lines, the file name is followed by a
|
||||
colon; for context lines, a hyphen separator is used. If a
|
||||
line number is also being output, it follows the file name.
|
||||
This option overrides any previous -H, -L, or -l options.
|
||||
File names are normally shown when multiple files are
|
||||
searched. By default, for matching lines, the file name is
|
||||
followed by a colon; for context lines, a hyphen separator is
|
||||
used. The -Z option can be used to change the terminator to a
|
||||
zero byte. If a line number is also being output, it follows
|
||||
the file name. This option overrides any previous -H, -L, or
|
||||
-l options.
|
||||
|
||||
--heap-limit=number
|
||||
See --match-limit below.
|
||||
|
||||
--help Output a help message, giving brief details of the command
|
||||
options and file type support, and then exit. Anything else
|
||||
--help Output a help message, giving brief details of the command
|
||||
options and file type support, and then exit. Anything else
|
||||
on the command line is ignored.
|
||||
|
||||
-I Ignore binary files. This is equivalent to --binary-
|
||||
-I Ignore binary files. This is equivalent to --binary-
|
||||
files=without-match.
|
||||
|
||||
-i, --ignore-case
|
||||
Ignore upper/lower case distinctions during comparisons.
|
||||
|
||||
--include=pattern
|
||||
If any --include patterns are specified, the only files that
|
||||
If any --include patterns are specified, the only files that
|
||||
are processed are those whose names match one of the patterns
|
||||
and do not match an --exclude pattern. This option does not
|
||||
affect directories, but it applies to all files, whether
|
||||
listed on the command line, obtained from --file-list, or by
|
||||
scanning a directory. The pattern is a PCRE2 regular expres-
|
||||
sion, and is matched against the final component of the file
|
||||
name, not the entire path. The -F, -w, and -x options do not
|
||||
apply to this pattern. The option may be given any number of
|
||||
times. If a file name matches both an --include and an --ex-
|
||||
clude pattern, it is excluded. There is no short form for
|
||||
and do not match an --exclude pattern. This option does not
|
||||
affect directories, but it applies to all files, whether
|
||||
listed on the command line, obtained from --file-list, or by
|
||||
scanning a directory. The pattern is a PCRE2 regular expres-
|
||||
sion, and is matched against the final component of the file
|
||||
name, not the entire path. The -F, -w, and -x options do not
|
||||
apply to this pattern. The option may be given any number of
|
||||
times. If a file name matches both an --include and an --ex-
|
||||
clude pattern, it is excluded. There is no short form for
|
||||
this option.
|
||||
|
||||
--include-from=filename
|
||||
Treat each non-empty line of the file as the data for an
|
||||
Treat each non-empty line of the file as the data for an
|
||||
--include option. What constitutes a newline for this purpose
|
||||
is the operating system's default. The --newline option has
|
||||
is the operating system's default. The --newline option has
|
||||
no effect on this option. This option may be given any number
|
||||
of times; all the files are read.
|
||||
|
||||
--include-dir=pattern
|
||||
If any --include-dir patterns are specified, the only direc-
|
||||
tories that are processed are those whose names match one of
|
||||
the patterns and do not match an --exclude-dir pattern. This
|
||||
applies to all directories, whether listed on the command
|
||||
line, obtained from --file-list, or by scanning a parent di-
|
||||
rectory. The pattern is a PCRE2 regular expression, and is
|
||||
matched against the final component of the directory name,
|
||||
not the entire path. The -F, -w, and -x options do not apply
|
||||
If any --include-dir patterns are specified, the only direc-
|
||||
tories that are processed are those whose names match one of
|
||||
the patterns and do not match an --exclude-dir pattern. This
|
||||
applies to all directories, whether listed on the command
|
||||
line, obtained from --file-list, or by scanning a parent di-
|
||||
rectory. The pattern is a PCRE2 regular expression, and is
|
||||
matched against the final component of the directory name,
|
||||
not the entire path. The -F, -w, and -x options do not apply
|
||||
to this pattern. The option may be given any number of times.
|
||||
If a directory matches both --include-dir and --exclude-dir,
|
||||
If a directory matches both --include-dir and --exclude-dir,
|
||||
it is excluded. There is no short form for this option.
|
||||
|
||||
-L, --files-without-match
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files that do not contain any lines that would
|
||||
have been output. Each file name is output once, on a sepa-
|
||||
rate line. This option overrides any previous -H, -h, or -l
|
||||
options.
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files that do not contain any lines that would
|
||||
have been output. Each file name is output once, on a sepa-
|
||||
rate line by default, but if the -Z option is set, they are
|
||||
separated by zero bytes instead of newlines. This option
|
||||
overrides any previous -H, -h, or -l options.
|
||||
|
||||
-l, --files-with-matches
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files containing lines that would have been out-
|
||||
put. Each file name is output once, on a separate line.
|
||||
Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the -c (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and
|
||||
those files that have at least one match are listed along
|
||||
with their counts. Using this option with -c is a way of sup-
|
||||
pressing the listing of files with no matches that occurs
|
||||
with -c on its own. This option overrides any previous -H,
|
||||
-h, or -L options.
|
||||
put. Each file name is output once, on a separate line, but
|
||||
if the -Z option is set, they are separated by zero bytes in-
|
||||
stead of newlines. Searching normally stops as soon as a
|
||||
matching line is found in a file. However, if the -c (count)
|
||||
option is also used, matching continues in order to obtain
|
||||
the correct count, and those files that have at least one
|
||||
match are listed along with their counts. Using this option
|
||||
with -c is a way of suppressing the listing of files with no
|
||||
matches that occurs with -c on its own. This option overrides
|
||||
any previous -H, -h, or -L options.
|
||||
|
||||
--label=name
|
||||
This option supplies a name to be used for the standard input
|
||||
|
@ -471,105 +487,102 @@ OPTIONS
|
|||
input)" is used. There is no short form for this option.
|
||||
|
||||
--line-buffered
|
||||
When this option is given, non-compressed input is read and
|
||||
processed line by line, and the output is flushed after each
|
||||
write. By default, input is read in large chunks, unless
|
||||
pcre2grep can determine that it is reading from a terminal,
|
||||
When this option is given, non-compressed input is read and
|
||||
processed line by line, and the output is flushed after each
|
||||
write. By default, input is read in large chunks, unless
|
||||
pcre2grep can determine that it is reading from a terminal,
|
||||
which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed
|
||||
by the operating system. This option can be useful when the
|
||||
input or output is attached to a pipe and you do not want
|
||||
pcre2grep to buffer up large amounts of data. However, its
|
||||
use will affect performance, and the -M (multiline) option
|
||||
ceases to work. When input is from a compressed .gz or .bz2
|
||||
by the operating system. This option can be useful when the
|
||||
input or output is attached to a pipe and you do not want
|
||||
pcre2grep to buffer up large amounts of data. However, its
|
||||
use will affect performance, and the -M (multiline) option
|
||||
ceases to work. When input is from a compressed .gz or .bz2
|
||||
file, --line-buffered is ignored.
|
||||
|
||||
--line-offsets
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
each match as a line number, the offset from the start of the
|
||||
line, and a length. The line number is terminated by a colon
|
||||
(as usual; see the -n option), and the offset and length are
|
||||
separated by a comma. In this mode, no context is shown.
|
||||
That is, the -A, -B, and -C options are ignored. If there is
|
||||
more than one match in a line, each of them is shown sepa-
|
||||
rately. This option is mutually exclusive with --output,
|
||||
line, and a length. The line number is terminated by a colon
|
||||
(as usual; see the -n option), and the offset and length are
|
||||
separated by a comma. In this mode, no context is shown.
|
||||
That is, the -A, -B, and -C options are ignored. If there is
|
||||
more than one match in a line, each of them is shown sepa-
|
||||
rately. This option is mutually exclusive with --output,
|
||||
--file-offsets, and --only-matching.
|
||||
|
||||
--locale=locale-name
|
||||
This option specifies a locale to be used for pattern match-
|
||||
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||
ronment variables. If no locale is specified, the PCRE2 li-
|
||||
This option specifies a locale to be used for pattern match-
|
||||
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||
ronment variables. If no locale is specified, the PCRE2 li-
|
||||
brary's default (usually the "C" locale) is used. There is no
|
||||
short form for this option.
|
||||
|
||||
-M, --multiline
|
||||
Allow patterns to match more than one line. When this option
|
||||
Allow patterns to match more than one line. When this option
|
||||
is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
-M may usefully contain literal newline characters and inter-
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
in which the match ended.
|
||||
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
next line, you could use this command:
|
||||
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
|
||||
The \s escape sequence matches any white space character, in-
|
||||
cluding newlines, and is followed by + so as to match trail-
|
||||
ing white space on the first line as well as possibly han-
|
||||
cluding newlines, and is followed by + so as to match trail-
|
||||
ing white space on the first line as well as possibly han-
|
||||
dling a two-character newline sequence.
|
||||
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
this should not be a problem, but the -M option does not work
|
||||
when input is read line by line (see --line-buffered.)
|
||||
|
||||
-m number, --max-count=number
|
||||
Stop processing after finding number matching lines, or non-
|
||||
matching lines if -v is also set. Any trailing context lines
|
||||
are output after the final match. In multiline mode, each
|
||||
multiline match counts as just one line for this purpose. If
|
||||
this limit is reached when reading the standard input from a
|
||||
Stop processing after finding number matching lines, or non-
|
||||
matching lines if -v is also set. Any trailing context lines
|
||||
are output after the final match. In multiline mode, each
|
||||
multiline match counts as just one line for this purpose. If
|
||||
this limit is reached when reading the standard input from a
|
||||
regular file, the file is left positioned just after the last
|
||||
matching line. If -c is also set, the count that is output
|
||||
is never greater than number. This option has no effect if
|
||||
matching line. If -c is also set, the count that is output
|
||||
is never greater than number. This option has no effect if
|
||||
used with -L, -l, or -q, or when just checking for a match in
|
||||
a binary file.
|
||||
|
||||
--match-limit=number
|
||||
Processing some regular expression patterns may take a very
|
||||
Processing some regular expression patterns may take a very
|
||||
long time to search for all possible matching strings. Others
|
||||
may require a very large amount of memory. There are three
|
||||
may require a very large amount of memory. There are three
|
||||
options that set resource limits for matching.
|
||||
|
||||
The --match-limit option provides a means of limiting comput-
|
||||
ing resource usage when processing patterns that are not go-
|
||||
ing resource usage when processing patterns that are not go-
|
||||
ing to match, but which have a very large number of possibil-
|
||||
ities in their search trees. The classic example is a pattern
|
||||
that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main pro-
|
||||
cessing loop. If the value set by --match-limit is reached,
|
||||
that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main pro-
|
||||
cessing loop. If the value set by --match-limit is reached,
|
||||
an error occurs.
|
||||
|
||||
The --heap-limit option specifies, as a number of kibibytes
|
||||
(units of 1024 bytes), the amount of heap memory that may be
|
||||
used for matching. Heap memory is needed only if matching the
|
||||
pattern requires a significant number of nested backtracking
|
||||
points to be remembered. This parameter can be set to zero to
|
||||
forbid the use of heap memory altogether.
|
||||
The --heap-limit option specifies, as a number of kibibytes
|
||||
(units of 1024 bytes), the maximum amount of heap memory that
|
||||
may be used for matching.
|
||||
|
||||
The --depth-limit option limits the depth of nested back-
|
||||
tracking points, which indirectly limits the amount of memory
|
||||
|
@ -806,6 +819,13 @@ OPTIONS
|
|||
does not apply to patterns specified by any of the --include
|
||||
or --exclude options.
|
||||
|
||||
-Z, --null
|
||||
Terminate file names in the regular output with a zero byte
|
||||
(the NUL character) instead of what would normally appear.
|
||||
This is useful when file names contain unusual characters
|
||||
such as colons, hyphens, or even newlines. The option does
|
||||
not apply to file names in error messages.
|
||||
|
||||
|
||||
ENVIRONMENT VARIABLES
|
||||
|
||||
|
@ -1010,11 +1030,11 @@ SEE ALSO
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 04 October 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -29,6 +29,7 @@ platforms:
|
|||
.sp
|
||||
ARM 32-bit (v5, v7, and Thumb2)
|
||||
ARM 64-bit
|
||||
IBM s390x 64 bit
|
||||
Intel x86 32-bit and 64-bit
|
||||
MIPS 32-bit and 64-bit
|
||||
Power PC 32-bit and 64-bit
|
||||
|
@ -250,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
|
|||
starts another match, that match must use a different JIT stack to the one used
|
||||
for currently suspended match(es).
|
||||
.P
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
.P
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
to a match context that is used by any number of patterns, as long as they are
|
||||
|
@ -266,7 +267,7 @@ inefficient solution, and not recommended.
|
|||
This is a suggestion for how a multithreaded program that needs to set up
|
||||
non-default JIT stacks might operate:
|
||||
.sp
|
||||
During thread initalization
|
||||
During thread initialization
|
||||
thread_local_var = pcre2_jit_stack_create(...)
|
||||
.sp
|
||||
During thread exit
|
||||
|
@ -354,8 +355,8 @@ out this complicated API.
|
|||
.B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
|
||||
.fi
|
||||
.P
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -415,10 +416,10 @@ that was not compiled.
|
|||
.P
|
||||
When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
.P
|
||||
Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
|
||||
speedups of more than 10%.
|
||||
|
@ -444,6 +445,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 November 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SIZE AND OTHER LIMITATIONS"
|
||||
|
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
.P
|
||||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
.P
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -67,6 +71,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2MATCHING 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2MATCHING 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 MATCHING ALGORITHMS"
|
||||
|
@ -61,8 +61,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
|
|||
If a leaf node is reached, a matching string has been found, and at that point
|
||||
the algorithm stops. Thus, if there is more than one possible match, this
|
||||
algorithm returns the first one that it finds. Whether this is the shortest,
|
||||
the longest, or some intermediate length depends on the way the greedy and
|
||||
ungreedy repetition quantifiers are specified in the pattern.
|
||||
the longest, or some intermediate length depends on the way the alternations
|
||||
and the greedy or ungreedy repetition quantifiers are specified in the
|
||||
pattern.
|
||||
.P
|
||||
Because it ends up with a single path through the tree, it is relatively
|
||||
straightforward for this algorithm to keep track of the substrings that are
|
||||
|
@ -91,10 +92,15 @@ no more unterminated paths. At this point, terminated paths represent the
|
|||
different matching possibilities (if there are none, the match has failed).
|
||||
Thus, if there is more than one possible match, this algorithm finds all of
|
||||
them, and in particular, it finds the longest. The matches are returned in
|
||||
decreasing order of length. There is an option to stop the algorithm after the
|
||||
first match (which is necessarily the shortest) is found.
|
||||
the output vector in decreasing order of length. There is an option to stop the
|
||||
algorithm after the first match (which is necessarily the shortest) is found.
|
||||
.P
|
||||
Note that all the matches that are found start at the same point in the
|
||||
Note that the size of vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
|
||||
data block is therefore not advisable when doing DFA matching.
|
||||
.P
|
||||
Note also that all the matches that are found start at the same point in the
|
||||
subject. If the pattern
|
||||
.sp
|
||||
cat(er(pillar)?)?
|
||||
|
@ -165,19 +171,13 @@ supported by \fBpcre2_dfa_match()\fP.
|
|||
.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
The main advantage of the alternative algorithm is that all possible matches
|
||||
(at a single point in the subject) are automatically found, and in particular,
|
||||
the longest match is found. To find more than one match at the same point using
|
||||
the standard algorithm, you have to do kludgy things with callouts.
|
||||
.P
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
match using the standard algorithm, you have to do kludgy things with
|
||||
callouts.
|
||||
.P
|
||||
2. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack (except for lookbehinds), it is possible to pass very
|
||||
long subject strings to the matching function in several pieces, checking for
|
||||
partial matching each time. Although it is also possible to do multi-segment
|
||||
matching using the standard algorithm, by retaining partially matched
|
||||
substrings, it is more complicated. The
|
||||
Partial matching is possible with this algorithm, though it has some
|
||||
limitations. The
|
||||
.\" HREF
|
||||
\fBpcre2partial\fP
|
||||
.\"
|
||||
|
@ -199,6 +199,8 @@ invalid UTF string are not supported.
|
|||
.P
|
||||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
.P
|
||||
4. JIT optimization is not supported.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -206,7 +208,7 @@ performance advantage that it does for the standard algorithm.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -215,6 +217,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 28 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "06 October 2020" "PCRE2 10.35"
|
||||
.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -509,7 +509,6 @@ for themselves. For example, outside a character class:
|
|||
.\" JOIN
|
||||
\e377 might be a backreference, otherwise
|
||||
the value 255 (decimal)
|
||||
.\" JOIN
|
||||
\e81 is always a backreference
|
||||
.sp
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
|
@ -741,7 +740,7 @@ Unicode support is not needed for these characters to be recognized.
|
|||
.P
|
||||
It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
|
||||
complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
|
||||
at compile time. (BSR is an abbrevation for "backslash R".) This can be made
|
||||
at compile time. (BSR is an abbreviation for "backslash R".) This can be made
|
||||
the default when PCRE2 is built; if this is the case, the other behaviour can
|
||||
be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
|
||||
these settings by starting a pattern string with one of the following
|
||||
|
@ -773,195 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.P
|
||||
The extra escape sequences that provide property support are:
|
||||
.sp
|
||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The property names represented by \fIxx\fP above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
The property names represented by \fIxx\fP above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
.\" HTML <a href="#extraprops">
|
||||
.\" </a>
|
||||
next section).
|
||||
below).
|
||||
.\"
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \eP{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \eP{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "Script properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and a equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
.P
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
.sp
|
||||
\ep{Greek}
|
||||
\eP{Han}
|
||||
.sp
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
.P
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "The general category property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
specified by including a circumflex between the opening brace and the property
|
||||
|
@ -1021,9 +889,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
.sp
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
.P
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
the range U+D800 to U+DFFF. These characters are no different to any other
|
||||
|
@ -1047,12 +915,53 @@ Unicode table.
|
|||
Specifying caseless matching does not affect these escape sequences. For
|
||||
example, \ep{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.
|
||||
.
|
||||
.SS "Binary (yes/no) properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.SS "The Bidi_Class property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
.sp
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
.
|
||||
.
|
||||
.SS Extended grapheme clusters
|
||||
|
@ -1082,7 +991,7 @@ additional characters according to the following rules for ending a cluster:
|
|||
3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
|
||||
are of five types: L, V, T, LV, and LVT. An L character may be followed by an
|
||||
L, V, LV, or LVT character; an LV or V character may be followed by a V or T
|
||||
character; an LVT or T character may be follwed only by a T character.
|
||||
character; an LVT or T character may be followed only by a T character.
|
||||
.P
|
||||
4. Do not end before extending characters or spacing marks or the "zero-width
|
||||
joiner" character. Characters with the "mark" property always have the
|
||||
|
@ -1168,9 +1077,11 @@ For example, when the pattern
|
|||
.sp
|
||||
matches "foobar", the first substring is still set to "foo".
|
||||
.P
|
||||
Perl used to document that the use of \eK within lookaround assertions is "not
|
||||
well defined", but from version 5.32.0 Perl does not support this usage at all.
|
||||
In PCRE2, \eK is acted upon when it occurs inside positive assertions, but is
|
||||
From version 5.32.0 Perl forbids the use of \eK in lookaround assertions. From
|
||||
release 10.38 PCRE2 also forbids this by default. However, the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
|
||||
\fBpcre2_compile()\fP to re-enable the previous behaviour. When this option is
|
||||
set, \eK is acted upon when it occurs inside positive assertions, but is
|
||||
ignored in negative assertions. Note that when a pattern such as (?=ab\eK)
|
||||
matches, the reported start of the match can be greater than the end of the
|
||||
match. Using \eK in a lookbehind assertion at the start of a pattern can also
|
||||
|
@ -1329,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
.sp
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
.\" HTML <a href="#newlines">
|
||||
.\" </a>
|
||||
"Newline conventions"
|
||||
.\"
|
||||
above).
|
||||
.P
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
|
||||
of CR or LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
.P
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
PCRE2_DOTALL option is set, a dot matches any one character, without exception.
|
||||
|
@ -2179,10 +2095,10 @@ be easier to remember:
|
|||
.sp
|
||||
(*atomic:\ed+)foo
|
||||
.sp
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
.P
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
string of characters that an identical standalone pattern would match, if
|
||||
|
@ -2928,7 +2844,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
|
||||
\eb (?&byte) (\e.(?&byte)){3} \eb
|
||||
.sp
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -3658,7 +3574,7 @@ successful match if there is a later mismatch. Consider:
|
|||
.sp
|
||||
If the subject is "aaaac...", after the first match attempt fails (starting at
|
||||
the first character in the string), the starting point skips on to start the
|
||||
next attempt at "c". Note that a possessive quantifer does not have the same
|
||||
next attempt at "c". Note that a possessive quantifier does not have the same
|
||||
effect as this example; although it would suppress backtracking during the
|
||||
first match attempt, the second attempt would start at the second character
|
||||
instead of skipping on to "c".
|
||||
|
@ -3889,7 +3805,7 @@ there is a backtrack at the outer level.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -3898,6 +3814,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 06 October 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 PERFORMANCE"
|
||||
|
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
.P
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
.P
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to \fBpcre2_match()\fP. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
.P
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to \fBpcre2_match()\fP with the same match data block does not
|
||||
affect the saved block.
|
||||
.P
|
||||
In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround assertions,
|
||||
|
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -239,6 +255,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
.nf
|
||||
.B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
|
||||
.B " pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
|
||||
.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
|
||||
.B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
|
||||
|
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
.sp
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
.sp
|
||||
|
@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
|
|||
\fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
.sp
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "28 December 2019" "PCRE2 10.35"
|
||||
.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range
|
|||
128-255. If the PCRE2_UCP option is set, the behaviour of these escape
|
||||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
.P
|
||||
Property descriptions in \ep and \eP are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
.
|
||||
.
|
||||
.SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP"
|
||||
|
@ -120,6 +124,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
.sp
|
||||
M Mark
|
||||
|
@ -167,165 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set
|
|||
at release 5.18.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT NAMES FOR \ep AND \eP"
|
||||
.SH "BINARY PROPERTIES FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER CLASSES"
|
||||
|
@ -401,6 +300,9 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
.sp
|
||||
\eK set reported start of match
|
||||
.sp
|
||||
From release 10.38 \eK is not permitted by default in lookaround assertions,
|
||||
for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\eK is honoured in positive assertions, but ignored in negative ones.
|
||||
.
|
||||
.
|
||||
|
@ -667,7 +569,7 @@ delimiter }. To encode the ending delimiter within the string, double it.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -676,6 +578,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 28 December 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "28 April 2021" "PCRE 10.37"
|
||||
.TH PCRE2TEST 1 "27 July 2022" "PCRE 10.41"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -27,12 +27,7 @@ each match attempt. Modifiers on external or internal command lines, the
|
|||
patterns, and the subject lines specify PCRE2 function options, control how the
|
||||
subject is processed, and what output is produced.
|
||||
.P
|
||||
As the original fairly simple PCRE library evolved, it acquired many different
|
||||
features, and as a result, the original \fBpcretest\fP program ended up with a
|
||||
lot of options in a messy, arcane syntax for testing all the features. The
|
||||
move to the new PCRE2 API provided an opportunity to re-implement the test
|
||||
program as \fBpcre2test\fP, with a cleaner modifier syntax. Nevertheless, there
|
||||
are still many obscure modifiers, some of which are specifically designed for
|
||||
There are many obscure modifiers, some of which are specifically designed for
|
||||
use in conjunction with the test script and data files that are distributed as
|
||||
part of PCRE2. All the modifiers are documented here, some without much
|
||||
justification, but many of them are unlikely to be of use except when testing
|
||||
|
@ -52,7 +47,7 @@ format before being passed to the library functions. Results are converted back
|
|||
to 8-bit code units for output.
|
||||
.P
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, \fBpcre_compile()\fP. The actual
|
||||
are given in generic form, for example, \fBpcre2_compile()\fP. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
.
|
||||
.
|
||||
|
@ -61,10 +56,10 @@ names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
|||
.rs
|
||||
.sp
|
||||
Input to \fBpcre2test\fP is processed line by line, either by calling the C
|
||||
library's \fBfgets()\fP function, or via the \fBlibreadline\fP library. In some
|
||||
Windows environments character 26 (hex 1A) causes an immediate end of file, and
|
||||
no further data is read, so this character should be avoided unless you really
|
||||
want that action.
|
||||
library's \fBfgets()\fP function, or via the \fBlibreadline\fP or \fBlibedit\fP
|
||||
library. In some Windows environments character 26 (hex 1A) causes an immediate
|
||||
end of file, and no further data is read, so this character should be avoided
|
||||
unless you really want that action.
|
||||
.P
|
||||
The input is processed using C's string functions, so must not
|
||||
contain binary zeros, even though in Unix-like environments, \fBfgets()\fP
|
||||
|
@ -216,7 +211,17 @@ available, and the use of JIT for matching is verified.
|
|||
\fB-LM\fP
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LP\fP
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LS\fP
|
||||
List scripts: write a list of recognized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-pattern\fP \fImodifier-list\fP
|
||||
Behave as if each pattern line contains the given modifiers.
|
||||
|
@ -443,15 +448,17 @@ excluding pattern meta-characters):
|
|||
.sp
|
||||
This is interpreted as the pattern's delimiter. A regular expression may be
|
||||
continued over several input lines, in which case the newline characters are
|
||||
included within it. It is possible to include the delimiter within the pattern
|
||||
by escaping it with a backslash, for example
|
||||
included within it. It is possible to include the delimiter as a literal within
|
||||
the pattern by escaping it with a backslash, for example
|
||||
.sp
|
||||
/abc\e/def/
|
||||
.sp
|
||||
If you do this, the escape and the delimiter form part of the pattern, but
|
||||
since the delimiters are all non-alphanumeric, this does not affect its
|
||||
interpretation. If the terminating delimiter is immediately followed by a
|
||||
backslash, for example,
|
||||
since the delimiters are all non-alphanumeric, the inclusion of the backslash
|
||||
does not affect the pattern's interpretation. Note, however, that this trick
|
||||
does not work within \eQ...\eE literal bracketing because the backslash will
|
||||
itself be interpreted as a literal. If the terminating delimiter is immediately
|
||||
followed by a backslash, for example,
|
||||
.sp
|
||||
/abc/\e
|
||||
.sp
|
||||
|
@ -470,11 +477,11 @@ A pattern can be followed by a modifier list (details below).
|
|||
.SH "SUBJECT LINE SYNTAX"
|
||||
.rs
|
||||
.sp
|
||||
Before each subject line is passed to \fBpcre2_match()\fP or
|
||||
\fBpcre2_dfa_match()\fP, leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes, unless the \fBsubject_literal\fP
|
||||
modifier was set for the pattern. The following provide a means of encoding
|
||||
non-printing characters in a visible way:
|
||||
Before each subject line is passed to \fBpcre2_match()\fP,
|
||||
\fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP, leading and trailing white
|
||||
space is removed, and the line is scanned for backslash escapes, unless the
|
||||
\fBsubject_literal\fP modifier was set for the pattern. The following provide a
|
||||
means of encoding non-printing characters in a visible way:
|
||||
.sp
|
||||
\ea alarm (BEL, \ex07)
|
||||
\eb backspace (\ex08)
|
||||
|
@ -570,6 +577,7 @@ way \fBpcre2_compile()\fP behaves. See
|
|||
for a description of the effects of these options.
|
||||
.sp
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
|
@ -1198,7 +1206,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use \fBpcre2_dfa_match()\fP
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1208,6 +1217,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1518,7 +1529,7 @@ value that was set on the pattern.
|
|||
.sp
|
||||
The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
\fBfind_limits\fP modifier is specified.
|
||||
\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
|
||||
.
|
||||
.
|
||||
.SS "Finding minimum limits"
|
||||
|
@ -1528,8 +1539,12 @@ If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via \fBpcre2_set_heap_limit()\fP,
|
||||
\fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
.P
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
|
||||
|
@ -1553,9 +1568,7 @@ and non-recursive, to the internal matching function, thus controlling the
|
|||
overall amount of computing resource that is used.
|
||||
.P
|
||||
For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
.
|
||||
.
|
||||
.SS "Showing MARK names"
|
||||
|
@ -1574,12 +1587,10 @@ is added to the non-match message.
|
|||
.sp
|
||||
The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the \fBmemory\fP modifier never has any effect. For this modifier to work, the
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
\fBnull_context\fP modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
.
|
||||
|
@ -1631,7 +1642,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
.
|
||||
.
|
||||
.SS "Passing a NULL context"
|
||||
.SS "Passing a NULL context, subject, or replacement"
|
||||
.rs
|
||||
.sp
|
||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
||||
|
@ -1639,7 +1650,12 @@ Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
|||
If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
|
||||
\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
|
||||
modifiers.
|
||||
.P
|
||||
Similarly, for testing purposes, if the \fBnull_subject\fP or
|
||||
\fBnull_replacement\fP modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
.
|
||||
.
|
||||
.SH "THE ALTERNATIVE MATCHING FUNCTION"
|
||||
|
@ -2096,7 +2112,7 @@ on the stack.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -2105,6 +2121,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 28 April 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "23 February 2020" "PCRE2 10.35"
|
||||
.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -40,10 +40,11 @@ handled, as documented below.
|
|||
.sp
|
||||
When PCRE2 is built with Unicode support, the escape sequences \ep{..},
|
||||
\eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -51,10 +52,10 @@ and
|
|||
.\" HREF
|
||||
\fBpcre2syntax\fP
|
||||
.\"
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
.
|
||||
.
|
||||
.SH "WIDE CHARACTERS AND UTF MODES"
|
||||
|
@ -448,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -457,6 +458,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 February 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 22 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
# PCRE2 - Perl-Compatible Regular Expressions
|
||||
|
||||
The PCRE2 library is a set of C functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
|
||||
own native API, as well as a set of wrapper functions that correspond to the
|
||||
POSIX regular expression API. The PCRE2 library is free, even for building
|
||||
proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
|
||||
or 32-bit code units, in either literal or UTF encoding.
|
||||
|
||||
PCRE2 was first released in 2015 to replace the API in the original PCRE
|
||||
library, which is now obsolete and no longer maintained. As well as a more
|
||||
flexible API, the code of PCRE2 has been much improved since the fork.
|
||||
|
||||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
If you just need the command-line PCRE2 tools on Windows, precompiled binary
|
||||
versions are available at this
|
||||
[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
|
||||
|
||||
A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
|
||||
default character encoding, can be found at
|
||||
[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
|
||||
|
||||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
||||
There is a curated summary of changes for each PCRE release, copies of
|
||||
documentation from older releases, and other useful information from the third
|
||||
party authored
|
||||
[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
|
||||
|
||||
## Contact
|
||||
|
||||
To report a problem with the PCRE2 library, or to make a feature request, please
|
||||
use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
|
||||
PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
|
||||
announcements will be made. You can browse the
|
||||
[list archives](https://groups.google.com/g/pcre2-dev).
|
||||
|
|
@ -0,0 +1,355 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This file is a Python module containing common lists and functions for the
|
||||
# GenerateXXX scripts that create various .c and .h files from Unicode data
|
||||
# files. It was created as part of a re-organization of these scripts in
|
||||
# December 2021.
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DATA LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
||||
|
||||
bidi_classes = [
|
||||
'AL', 'Arabic letter',
|
||||
'AN', 'Arabic number',
|
||||
'B', 'Paragraph separator',
|
||||
'BN', 'Boundary neutral',
|
||||
'CS', 'Common separator',
|
||||
'EN', 'European number',
|
||||
'ES', 'European separator',
|
||||
'ET', 'European terminator',
|
||||
'FSI', 'First strong isolate',
|
||||
'L', 'Left to right',
|
||||
'LRE', 'Left to right embedding',
|
||||
'LRI', 'Left to right isolate',
|
||||
'LRO', 'Left to right override',
|
||||
'NSM', 'Non-spacing mark',
|
||||
'ON', 'Other neutral',
|
||||
'PDF', 'Pop directional format',
|
||||
'PDI', 'Pop directional isolate',
|
||||
'R', 'Right to left',
|
||||
'RLE', 'Right to left embedding',
|
||||
'RLI', 'Right to left isolate',
|
||||
'RLO', 'Right to left override',
|
||||
'S', 'Segment separator',
|
||||
'WS', 'White space'
|
||||
]
|
||||
|
||||
# Particular category property names, with comments. NOTE: If ever this list
|
||||
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
|
||||
# must be edited to keep in step.
|
||||
|
||||
category_names = [
|
||||
'Cc', 'Control',
|
||||
'Cf', 'Format',
|
||||
'Cn', 'Unassigned',
|
||||
'Co', 'Private use',
|
||||
'Cs', 'Surrogate',
|
||||
'Ll', 'Lower case letter',
|
||||
'Lm', 'Modifier letter',
|
||||
'Lo', 'Other letter',
|
||||
'Lt', 'Title case letter',
|
||||
'Lu', 'Upper case letter',
|
||||
'Mc', 'Spacing mark',
|
||||
'Me', 'Enclosing mark',
|
||||
'Mn', 'Non-spacing mark',
|
||||
'Nd', 'Decimal number',
|
||||
'Nl', 'Letter number',
|
||||
'No', 'Other number',
|
||||
'Pc', 'Connector punctuation',
|
||||
'Pd', 'Dash punctuation',
|
||||
'Pe', 'Close punctuation',
|
||||
'Pf', 'Final punctuation',
|
||||
'Pi', 'Initial punctuation',
|
||||
'Po', 'Other punctuation',
|
||||
'Ps', 'Open punctuation',
|
||||
'Sc', 'Currency symbol',
|
||||
'Sk', 'Modifier symbol',
|
||||
'Sm', 'Mathematical symbol',
|
||||
'So', 'Other symbol',
|
||||
'Zl', 'Line separator',
|
||||
'Zp', 'Paragraph separator',
|
||||
'Zs', 'Space separator'
|
||||
]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_properties = [
|
||||
'CR', ' 0',
|
||||
'LF', ' 1',
|
||||
'Control', ' 2',
|
||||
'Extend', ' 3',
|
||||
'Prepend', ' 4',
|
||||
'SpacingMark', ' 5',
|
||||
'L', ' 6 Hangul syllable type L',
|
||||
'V', ' 7 Hangul syllable type V',
|
||||
'T', ' 8 Hangul syllable type T',
|
||||
'LV', ' 9 Hangul syllable type LV',
|
||||
'LVT', '10 Hangul syllable type LVT',
|
||||
'Regional_Indicator', '11',
|
||||
'Other', '12',
|
||||
'ZWJ', '13',
|
||||
'Extended_Pictographic', '14'
|
||||
]
|
||||
|
||||
# List of files from which the names of Boolean properties are obtained, along
|
||||
# with a list of regex patterns for properties to be ignored, and a list of
|
||||
# extra pattern names to add.
|
||||
|
||||
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
|
||||
bool_propsignore = [r'^Other_', r'^Hyphen$']
|
||||
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GET BOOLEAN PROPERTY NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Get a list of Boolean property names from a number of files.
|
||||
|
||||
def getbpropslist():
  """Return a sorted list of Boolean property names.

  Each file named in bool_propsfiles is read from the Unicode.tables/
  directory. On every line, anything from '#' onwards is discarded, the
  remainder is split on ';' and the second field is taken as a property
  name. A name is skipped when it equals the previously seen name or when
  it matches one of the patterns in bool_propsignore. The names in
  bool_propsextras are then appended and the whole list is sorted.

  If a file cannot be opened, a message is printed and the process exits
  with status 1.
  """
  # Imported locally: the error path below needs sys.exit(), and the only
  # module-level import is `re`.
  import sys

  bplist = []
  bplast = ""

  for filename in bool_propsfiles:
    try:
      file = open('Unicode.tables/' + filename, 'r')
    except IOError:
      print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
      sys.exit(1)

    # The context manager closes the file even if parsing raises.
    with file:
      for line in file:
        line = re.sub(r'#.*', '', line)
        data = list(map(str.strip, line.split(';')))
        if len(data) <= 1 or data[1] == bplast:
          continue

        # Remember the name before filtering, so that a run of lines for an
        # ignored property is also collapsed by the check above.
        bplast = data[1]
        if not any(re.match(pat, bplast) for pat in bool_propsignore):
          bplist.append(bplast)

  bplist.extend(bool_propsextras)
  bplist.sort()
  return bplist
|
||||
|
||||
bool_properties = getbpropslist()
|
||||
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# COLLECTING PROPERTY NAMES AND ALIASES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_names = ['Unknown']
|
||||
abbreviations = {}
|
||||
|
||||
def collect_property_names():
    """Fill script_names and abbreviations from the Unicode data files.

    Script names are appended to script_names in the order in which they
    first appear in Scripts.txt (consecutive repeats are skipped). The
    abbreviations dictionary maps a full script or Boolean property name to a
    tuple of its alternative names.
    """
    global script_names
    global abbreviations

    script_line_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    previous = ""
    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            found = script_line_re.match(line)
            if found is None:
                continue
            current = found.group(1)
            if current == previous:
                continue
            previous = current
            script_names.append(current)

    # A line may carry a trailing comment, so simply splitting on
    # semicolons is not enough; a regular expression is used instead.
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt") as f:
        for line in f:
            found = value_alias_re.match(line)
            if found is None or found.group(1) != "sc":
                continue
            short, full, extra = found.group(2, 3, 4)
            if short == full:
                abbreviations[full] = ()
            elif extra is None:
                abbreviations[full] = (short,)
            else:
                abbreviations[full] = (short, extra)

    # Boolean property abbreviations go into the same dictionary.
    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')

    with open("Unicode.tables/PropertyAliases.txt") as f:
        for line in f:
            found = bin_alias_re.match(line)
            if found is None:
                continue
            short, full, extra = found.group(1, 2, 3)
            if full not in bool_properties:
                continue
            abbreviations[full] = (short,) if extra is None else (short, extra)
|
||||
|
||||
collect_property_names()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# REORDERING SCRIPT NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_abbrevs = []
|
||||
|
||||
def reorder_scripts():
    """Build script_abbrevs and reorder both script lists.

    Scripts whose abbreviation occurs in ScriptExtensions.txt are moved to
    the front; all other scripts follow. Relative order inside each group is
    unchanged, and script_names / script_abbrevs stay index-aligned.
    """
    global script_names
    global script_abbrevs
    global abbreviations

    # A script with no recorded abbreviation is represented by its full name.
    for full_name in script_names:
        known = abbreviations[full_name]
        script_abbrevs.append(known[0] if known else full_name)

    in_extensions = set()
    with open("Unicode.tables/ScriptExtensions.txt") as f:
        names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
        for line in f:
            found = names_re.match(line)
            if found is not None:
                in_extensions.update(found.group(1).split(" "))

    # sorted() is stable, so keying on "not in extensions" (False < True)
    # partitions the indices while preserving order within each partition.
    order = sorted(range(len(script_abbrevs)),
                   key=lambda i: script_abbrevs[i] not in in_extensions)

    script_names = [script_names[i] for i in order]
    script_abbrevs = [script_abbrevs[i] for i in order]
|
||||
|
||||
reorder_scripts()
|
||||
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DERIVED LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Create general character property names from the first letters of the
|
||||
# particular categories.
|
||||
|
||||
# category_names holds pairs of entries; the first letter of every
# even-indexed entry gives a general category name.
gcn_set = {entry[0] for entry in category_names[::2]}
general_category_names = sorted(gcn_set)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
|
||||
# Open an output file, using the command's argument or a default. Write common
|
||||
# preliminary header information.
|
||||
|
||||
# open_output(default): open the output file and write the common header.
#   default - file name to use when no command-line argument is given.
#   Returns the open, writable file object, positioned after the header.
#   Exits with status 1 if more than one argument is given or if the file
#   cannot be opened.
def open_output(default):
|
||||
# More than one command-line argument is an error.
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a file name')
|
||||
sys.exit(1)
|
||||
# A single argument overrides the default output name.
if len(sys.argv) == 2:
|
||||
output_name = sys.argv[1]
|
||||
else:
|
||||
output_name = default
|
||||
try:
|
||||
file = open(output_name, "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open %s" % output_name)
|
||||
sys.exit(1)
|
||||
|
||||
# Reduce argv[0] to the part after the last '/' so that the header names
# just the script file.
script_name = sys.argv[0]
|
||||
i = script_name.rfind('/')
|
||||
if i >= 0:
|
||||
script_name = script_name[i+1:]
|
||||
|
||||
# First part of the header: banner and authorship notice.
file.write("""\
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
|
||||
""")
|
||||
|
||||
# Second part: point the reader at the generating script by name.
file.write("Instead, modify the maint/%s script and run it to generate\n"
|
||||
"a new version of this code.\n\n" % script_name)
|
||||
|
||||
# Third part: the licence text, which also closes the C comment.
file.write("""\
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
\n""")
|
||||
# The caller receives the open file and writes the rest of the module.
return file
|
||||
|
||||
# End of UcpCommon.py
|
|
@ -0,0 +1,188 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This file auto-generates unicode property tests and their expected output.
|
||||
# It is recommended to re-run this generator after the unicode files are
|
||||
# updated. The names of the generated files are `testinput26` and `testoutput26`
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from GenerateCommon import \
|
||||
script_names, \
|
||||
script_abbrevs
|
||||
|
||||
def write_both(text):
    """Write the same text to the test input file and then the expected-output file."""
    for handle in (input_file, output_file):
        handle.write(text)
|
||||
|
||||
def to_string_char(ch_idx):
    """Return the text used to represent code point ch_idx in a test file.

    Code points from 32 to 127 are returned as the character itself. All
    others are returned as a \\x{...} hexadecimal escape; values below 16 get
    a leading zero so that the escape has two hex digits.
    """
    if 32 <= ch_idx < 128:
        return chr(ch_idx)
    template = "\\x{0%x}" if ch_idx < 16 else "\\x{%x}"
    return template % ch_idx
|
||||
|
||||
# At most one command-line argument is accepted: the output directory.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

# With no argument the files are created relative to the current directory.
output_directory = sys.argv[1] if len(sys.argv) == 2 else ""
if len(sys.argv) == 2 and not output_directory.endswith("/"):
    output_directory += "/"

try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print ("** Couldn't open output files")
    sys.exit(1)
|
||||
|
||||
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UNICODE SCRIPT EXTENSION TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
write_both("# Unicode Script Extension tests.\n\n")
|
||||
|
||||
def gen_script_tests():
    """Write Script and Script_Extensions tests for every script but Unknown.

    For each script a five-element record is collected:
      [0] first code point of the first Scripts.txt range seen for the script
      [1] last code point of the last Scripts.txt range seen for the script
      [2] lowest code point listing the script in ScriptExtensions.txt
      [3] highest code point listing the script in ScriptExtensions.txt
      [4] a code point that lists the script as an extension but whose own
          script is different (None if none was found)
    Test patterns and subjects go to both files; expected results go only to
    the output file.
    """
    script_data = [None] * len(script_names)
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            found = property_re.match(line)
            if found is None:
                continue

            name = found.group(3)
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(found.group(1), 16)
            high = low
            char_data[low] = name

            if found.group(2) is not None:
                high = int(found.group(2), 16)
                for code_point in range(low + 1, high + 1):
                    char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    extended_script_indicies = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            found = property_re.match(line)
            if found is None:
                continue

            low = int(found.group(1), 16)
            high = low
            if found.group(2) is not None:
                high = int(found.group(2), 16)

            for abbrev in found.group(3).split(" "):
                if abbrev in extended_script_indicies:
                    # Seen before: widen the recorded extension range.
                    idx = extended_script_indicies[abbrev]
                    rec = script_data[idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high
                else:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indicies[abbrev] = idx
                    rec = script_data[idx]
                    rec[2] = low
                    rec[3] = high

                if rec[4] is None:
                    # Look for a character in this range whose base script
                    # differs from the script named in the extension list.
                    name = script_names[idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    def emit_case(pattern_line, code_point, matches):
        # One test: pattern, subject line, expected result, blank separator.
        subject = to_string_char(code_point)
        write_both(pattern_line)
        write_both(" %s\n" % subject)
        output_file.write((" 0: %s\n" % subject) if matches else "No match\n")
        write_both("\n")

    # Alternates between the short and long Script_Extensions property name.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]
        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        emit_case("/^\\p{sc=%s}/utf\n" % script_name, rec[0], True)
        emit_case("/^\\p{Script=%s}/utf\n" % script_abbrev, rec[1], True)

        if rec[2] is not None:
            property_name = "Script_Extensions" if long_property_name else "scx"

            write_both("# Script extension check\n")
            emit_case("/^\\p{%s}/utf\n" % script_name, rec[2], True)
            emit_case("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev), rec[3], True)

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                emit_case("/^\\p{%s}/utf\n" % script_name, rec[4], True)
                emit_case("/^\\p{sc=%s}/utf\n" % script_name, rec[4], False)
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past both recorded ranges is used as a
        # character that does not match the script.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        emit_case("/^\\p{%s}/utf\n" % script_name, high + 1, False)
|
||||
|
||||
|
||||
gen_script_tests()
|
||||
|
||||
write_both("# End of testinput26\n")
|
|
@ -0,0 +1,923 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This script generates the pcre2_ucd.c file from Unicode data files. This is
|
||||
# the compressed Unicode property data used by PCRE2. The script was created in
|
||||
# December 2021 as part of the Unicode data generation refactoring. It is
|
||||
# basically a re-working of the MultiStage2.py script that was submitted to the
|
||||
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
|
||||
# Unicode property support. A number of extensions have since been added. The
|
||||
# main difference in the 2021 upgrade (apart from comments and layout) is that
|
||||
# the data tables (e.g. list of script names) are now listed in or generated by
|
||||
# a separate Python module that is shared with the other Generate scripts.
|
||||
#
|
||||
# This script must be run in the "maint" directory. It requires the following
|
||||
# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
|
||||
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
|
||||
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
|
||||
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
|
||||
# subdirectory of the Unicode database (UCD) on the Unicode web site;
|
||||
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
|
||||
# are in the top-level UCD directory.
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to the original script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to the original script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# Added code to add a bidi class field to records by scanning the
|
||||
# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
|
||||
# bytes, so now 11 out of 12 are in use.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# PCRE2-10.39: Updated for Unicode 14.0.0
|
||||
# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
|
||||
# and also PropList.txt for the Bidi_Control property
|
||||
# 19-December-2021: Reworked script extensions lists to be bit maps instead
|
||||
# of zero-terminated lists of script numbers.
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# Changes to the refactored script:
|
||||
#
|
||||
# 26-December-2021: Refactoring completed
|
||||
# 10-January-2022: Addition of general Boolean property support
|
||||
# 12-January-2022: Merge scriptx and bidiclass fields
|
||||
# 14-January-2022: Enlarge Boolean property offset to 12 bits
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contain no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), one for each
|
||||
# Unicode character. Each record contains the script number, script extension
|
||||
# value, character type, grapheme break type, offset to caseless matching set,
|
||||
# offset to the character's other case, the bidi class, and offset to bitmap of
|
||||
# Boolean properties.
|
||||
#
|
||||
# A real table covering all Unicode characters would be far too big. It can be
|
||||
# efficiently compressed by observing that many characters have the same
|
||||
# record, and many blocks of characters (taking 128 characters in a block) have
|
||||
# the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
# process.
|
||||
#
|
||||
# This script constructs seven tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# Scripts are partitioned into two groups. Scripts that appear in at least one
|
||||
# character's script extension list come first, followed by "Unknown" and then
|
||||
# all the rest. This sorting is done automatically in the GenerateCommon.py
|
||||
# script. A script's number is its index in the script_names list.
|
||||
#
|
||||
# The ucd_script_sets table contains bitmaps that represent lists of scripts
|
||||
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
||||
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
||||
# used in any character's extension list, that is, enough for every script
|
||||
# whose number is less than ucp_Unknown. A character's script extension value
|
||||
# in its ucd record is an offset into the ucd_script_sets vector. The first
|
||||
# bitmap has no bits set; characters that have no script extensions have zero
|
||||
# as their script extensions value so that they use this map.
|
||||
#
|
||||
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
|
||||
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
|
||||
# numbers, enough to allocate a bit for each supported Boolean property.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique character record
|
||||
# that is required. The ucd_stage1 table is indexed by a character's block
|
||||
# number, which is the character's code point divided by 128, since 128 is the
|
||||
# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
|
||||
# number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 14.0.0 database. Future
|
||||
# updates may change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 35
|
||||
# record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
|
||||
# 0 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 44 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 93
|
||||
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
||||
# record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
|
||||
# 20 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 82 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
||||
# record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
|
||||
# 84 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 26762 = 0x688A => Combined Bidi class + script extension values
|
||||
# 96 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||
# 138 => Script Extension list offset = 138
|
||||
#
|
||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||
# 18, and 47 set. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||
#
|
||||
# Philip Hazel, last updated 14 January 2022.
|
||||
##############################################################################
|
||||
|
||||
|
||||
# Import standard modules
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_propsfiles, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_abbrevs, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Some general parameters
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DEFINE FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
|
||||
# or DerivedGeneralCategory.txt
|
||||
|
||||
def make_get_names(enum):
    """Return a function mapping a parsed data line to an index in enum.

    The returned function looks up the line's second field in enum and
    returns its position; a field not present in enum raises ValueError.
    """
    def lookup(chardata):
        return enum.index(chardata[1])
    return lookup
|
||||
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
|
||||
def get_other_case(chardata):
    """Return the offset from a character to its other case.

    chardata holds the fields of a CaseFolding.txt line: code point, status,
    mapping. Only lines with status 'C' or 'S' yield an offset (mapping minus
    code point); every other status yields 0.
    """
    if chardata[1] not in ('C', 'S'):
        return 0
    return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
|
||||
def get_script_extension(chardata):
    """Return the offset of the script list for a ScriptExtensions.txt line.

    The second field is a space-separated list of script abbreviations. If it
    equals the list from the previous call, the offset of the most recently
    added entry is returned. Otherwise a tuple of script indices is appended
    to script_lists and the offset of that new entry is returned. Offsets
    are in units of script_list_item_size per entry.
    """
    global last_script_extension

    next_offset = len(script_lists) * script_list_item_size
    abbrev_list = chardata[1]

    if abbrev_list == last_script_extension:
        # Same list as last time: reuse the entry added for it.
        return next_offset - script_list_item_size

    last_script_extension = abbrev_list
    indices = tuple(script_abbrevs.index(abbrev) for abbrev in abbrev_list.split(' '))
    script_lists.append(indices)
    return next_offset
|
||||
|
||||
|
||||
# Read a whole table in memory, setting/checking the Unicode version
|
||||
|
||||
def read_table(file_name, get_value, default_value):
    """Read a Unicode data file into a table indexed by code point.

    file_name      path of the form "<dir>/<base>.txt"
    get_value      function called with the list of stripped,
                   semicolon-separated fields of each data line; its result
                   is stored for every code point the line covers
    default_value  initial value of every table entry

    The first line of the file must be "# <base>-<x.y.z>.txt". The version
    found there sets the global unicode_version on first use; a later file
    with a different version produces a warning on stderr.

    Returns a list of MAX_UNICODE values.
    """
    global unicode_version

    f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
    file_base = f.group(1)
    version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"

    # The context manager closes the file even if parsing raises.
    with open(file_name, 'r', encoding='utf-8') as file:
        f = re.match(version_pat, file.readline())
        version = f.group(1)
        if unicode_version == "":
            unicode_version = version
        elif unicode_version != version:
            # The file name must be interpolated with %; passing it as a
            # second print() argument would output a literal "%s".
            print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

        table = [default_value] * MAX_UNICODE
        for line in file:
            line = re.sub(r'#.*', '', line)
            chardata = list(map(str.strip, line.split(';')))
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)
            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)
            for i in range(char, last + 1):
                # It is important not to overwrite a previously set value
                # because in the CaseFolding file there are lines to be
                # ignored (returning the default value of 0) which often come
                # after a line which has already set data.
                if table[i] == default_value:
                    table[i] = value

    return table
|
||||
|
||||
|
||||
# Get the smallest possible C language type for the values in a table
|
||||
|
||||
def get_type_size(table):
    """Return (C type name, size in bytes) of the smallest type holding table.

    Unsigned types are tried first, from narrowest to widest, then signed
    types. OverflowError is raised if no 8-, 16- or 32-bit type can hold
    every value.
    """
    lowest = min(table)
    highest = max(table)

    candidates = (
        ("uint8_t", 1, 0, 255),
        ("uint16_t", 2, 0, 65535),
        ("uint32_t", 4, 0, 4294967295),
        ("signed char", 1, -128, 127),
        ("int16_t", 2, -32768, 32767),
        ("int32_t", 4, -2147483648, 2147483647),
    )
    for c_type, width, floor, ceiling in candidates:
        if floor <= lowest and highest <= ceiling:
            return (c_type, width)

    raise OverflowError("Too large to fit into C types")
|
||||
|
||||
|
||||
# Get the total size of a list of tables
|
||||
|
||||
def get_tables_size(*tables):
    """Return the total size in bytes of the given tables.

    Each table contributes its length multiplied by the element size that
    get_type_size() selects for it.
    """
    return sum(get_type_size(table)[1] * len(table) for table in tables)
|
||||
|
||||
|
||||
# Compress a table into the two stages
|
||||
|
||||
def compress_table(table, block_size):
    """Compress a table into the two lookup stages.

    The table is cut into consecutive blocks of block_size entries, and
    identical blocks are stored only once.

    Returns (stage1, stage2):
      stage1  one integer per input block: the number of the stored block
              that holds its contents
      stage2  the stored blocks, concatenated
    """
    blocks = {}  # Dictionary for finding identical blocks
    stage1 = []  # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = []  # Stage 2 table contains the blocks with property values
    table = tuple(table)
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Floor division keeps the block number an
            # integer; true division would put floats into stage1.
            start = len(stage2) // block_size
            stage2.extend(block)
            blocks[block] = start
        stage1.append(start)
    return stage1, stage2
|
||||
|
||||
|
||||
# Output a table
|
||||
|
||||
def write_table(table, table_name, block_size = None):
    """Write table to the output file f as a C array definition.

    The opening line carries the C type chosen by get_type_size(), the table
    name and a comment with the byte count (plus the block size when one is
    given). Without a block size the values are written 16 per line, each
    line ending with a U+XXXX comment; with a block size every block is
    introduced by a "block N" comment.
    """
    ctype, elem_size = get_type_size(table)
    ELEMS_PER_LINE = 16

    header = "const %s %s[] = { /* %d bytes" % (ctype, table_name, elem_size * len(table))
    if block_size:
        header += ", block = %d" % block_size
    f.write(header + " */\n")

    values = tuple(table)
    if block_size is None:
        line_fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
        # Factor that turns a table position into the value shown in the
        # U+ comment at the end of the line.
        scale = MAX_UNICODE / len(values)
        for pos in range(0, len(values), ELEMS_PER_LINE):
            f.write(line_fmt % (values[pos:pos+ELEMS_PER_LINE] + (int(pos * scale),)))
    else:
        per_line = ELEMS_PER_LINE if block_size > ELEMS_PER_LINE else block_size
        block_fmt = "%3d," * per_line + "\n"
        if block_size > ELEMS_PER_LINE:
            block_fmt = block_fmt * int(block_size / ELEMS_PER_LINE)
        block_fmt = "\n/* block %d */\n" + block_fmt
        for pos in range(0, len(values), block_size):
            f.write(block_fmt % ((pos / block_size,) + values[pos:pos+block_size]))
    f.write("};\n\n")
|
||||
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
|
||||
def combine_tables(*tables):
    """Merge parallel tables into unique records plus a per-position index.

    Returns (index, records). records maps each distinct tuple of values
    taken across the tables to a number assigned in order of first
    appearance; index lists that number for every position.
    """
    records = {}
    index = []
    for combo in zip(*tables):
        # len(records) is evaluated before any insertion, so an unseen
        # combination receives the next unused number.
        index.append(records.setdefault(combo, len(records)))
    return index, records
|
||||
|
||||
|
||||
# Create a record struct
|
||||
|
||||
def get_record_size_struct(records):
    """Compute the padded size of a record and describe it as a C struct.

    records is a sequence of equal-length tuples. For each field position
    the narrowest C type is chosen with get_type_size(), and the field is
    aligned to its own size. Returns (size, structure), where size is the
    record size in bytes including trailing padding and structure is the
    text of a typedef followed by a closing "*/".
    """
    def align(offset, boundary):
        # Round offset up to a multiple of boundary (1, 2 or 4 here).
        return (offset + boundary - 1) & -boundary

    total = 0
    members = []
    for field in range(len(records[0])):
        ctype, width = get_type_size([rec[field] for rec in records])
        total = align(total, width) + width
        members.append('%s property_%d;\n' % (ctype, field))

    # Pad so that the first field of the next record in an array is aligned.
    first_width = get_type_size([rec[0] for rec in records])[1]
    total = align(total, first_width)

    return total, 'typedef struct {\n' + ''.join(members) + '} ucd_record;\n*/\n'
|
||||
|
||||
|
||||
# Write records
|
||||
|
||||
def write_records(records, record_size):
    """Write the unique records to f as the PRIV(ucd_records) C array.

    records maps each record tuple to its number. Entries are written in
    ascending number order, each followed by a comment giving its position.
    """
    size_note = '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size)
    f.write('const ucd_record PRIV(ucd_records)[] = { ' + size_note)
    ordered = sorted(records.items(), key = lambda item: item[1])
    for position, (fields, _) in enumerate(ordered):
        row_fmt = ' {' + '%6d, ' * len(fields) + '}, /* %3d */\n'
        f.write(row_fmt % (fields + (position,)))
    f.write('};\n\n')
|
||||
|
||||
|
||||
# Write a bit set
|
||||
|
||||
def write_bitsets(list, item_size):
    """Write each collection of bit numbers in list to f as 32-bit hex words.

    Every entry becomes one output line of item_size words; bit number n
    sets bit (n & 31) of word (n // 32). After the last entry the closing
    "};" of the enclosing C array is written.
    """
    for bit_numbers in list:
        words = [0] * item_size
        for bit in bit_numbers:
            words[bit // 32] |= 1 << (bit & 31)
        for position, word in enumerate(words):
            # The first word is preceded by a space, later ones by ", ".
            f.write("%s" % (" " if position == 0 else ", "))
            f.write("0x%08xu" % word)
        f.write(",\n")
    f.write("};\n\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# This bit of code must have been useful when the original script was being
|
||||
# developed. Retain it just in case it is ever needed again.
|
||||
|
||||
# def test_record_size():
|
||||
# tests = [ \
|
||||
# ( [(3,), (6,), (6,), (1,)], 1 ), \
|
||||
# ( [(300,), (600,), (600,), (100,)], 2 ), \
|
||||
# ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
||||
# ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
||||
# ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
||||
# ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
||||
# ]
|
||||
# for test in tests:
|
||||
# size, struct = get_record_size_struct(test[0])
|
||||
# assert(size == test[1])
|
||||
# test_record_size()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR CREATING TABLES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
unicode_version = ""
|
||||
|
||||
# Some of the tables imported from GenerateCommon.py have alternate comment
|
||||
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
|
||||
# remove them.
|
||||
|
||||
bidi_classes = bidi_classes[::2]
|
||||
break_properties = break_properties[::2]
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create the various tables from Unicode data files
|
||||
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
|
||||
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
||||
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
|
||||
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
||||
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
# Scan emoji-data.txt. Data lines have the form "XXXX[..YYYY] ; property" and
# anything after '#' is a comment. Only Extended_Pictographic lines are used.
# The "with" block closes the file even if a line fails to parse.
with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        chardata = list(map(str.strip, line.split(';')))
        if len(chardata) <= 1:
            continue
        if chardata[1] != "Extended_Pictographic":
            continue
        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
        char = int(m.group(1), 16)
        if m.group(3) is None:
            last = char
        else:
            last = int(m.group(3), 16)
        for i in range(char, last + 1):
            if break_props[i] != break_properties.index('Other'):
                # Apply the % operator here: passing the values to print() as
                # separate arguments would leave the placeholders unfilled.
                print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
                      (i, break_properties[break_props[i]]), file=sys.stderr)
            break_props[i] = break_properties.index('Extended_Pictographic')
|
||||
|
||||
# Handle script extensions. The get_script_extension() function maintains a
|
||||
# list of unique bitmaps representing lists of scripts, returning the offset
|
||||
# in that list. Initialize the list with an empty set, which is used for
|
||||
# characters that have no script extensions.
|
||||
|
||||
script_lists = [[]]
|
||||
last_script_extension = ""
|
||||
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
|
||||
|
||||
for idx in range(len(scriptx_bidi_class)):
|
||||
scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
|
||||
bidi_class = None
|
||||
|
||||
# Find the Boolean properties of each character. This next bit of magic creates
|
||||
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
|
||||
# the *same* list, which is not what we want.
|
||||
|
||||
bprops = [[] for _ in range(MAX_UNICODE)]
|
||||
|
||||
# Collect the properties from the various files
|
||||
|
||||
# Read every Boolean property file. Data lines have the form
# "XXXX[..YYYY] ; property" and anything after '#' is a comment. Lines whose
# property is not in bool_properties are ignored.
for filename in bool_propsfiles:
    try:
        # Open as UTF-8, like the other Unicode data files read by this
        # script, instead of relying on the locale's default encoding.
        file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
    except IOError:
        print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
        sys.exit(1)

    # "with" closes the file even if a line fails to parse.
    with file:
        for line in file:
            line = re.sub(r'#.*', '', line)
            data = list(map(str.strip, line.split(';')))
            if len(data) <= 1:
                continue

            try:
                ix = bool_properties.index(data[1])
            except ValueError:
                continue

            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)

            for i in range(char, last + 1):
                bprops[i].append(ix)
|
||||
|
||||
# The ASCII property isn't listed in any files, but it is easy enough to add
|
||||
# it manually.
|
||||
|
||||
ix = bool_properties.index("ASCII")
|
||||
for i in range(128):
|
||||
bprops[i].append(ix)
|
||||
|
||||
# The Bidi_Mirrored property isn't listed in any property files. We have to
|
||||
# deduce it from the file that lists the mirrored characters.
|
||||
|
||||
ix = bool_properties.index("Bidi_Mirrored")
|
||||
|
||||
try:
    # Open as UTF-8, like the other Unicode data files read by this script,
    # instead of relying on the locale's default encoding.
    file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
except IOError:
    print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
    sys.exit(1)

# Every data line starts with the code point of a mirrored character; text
# after '#' is a comment. "with" closes the file even if parsing fails.
with file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        data = list(map(str.strip, line.split(';')))
        if len(data) <= 1:
            continue
        c = int(data[0], 16)
        bprops[c].append(ix)
|
||||
|
||||
# Scan each character's boolean property list and create a list of unique
|
||||
# lists, at the same time, setting the index in that list for each property in
|
||||
# the bool_props vector.
|
||||
|
||||
bool_props = [0] * MAX_UNICODE
bool_props_lists = [[]]

# Map each distinct set of properties to the index of the first list in
# bool_props_lists that holds it. One dictionary lookup per character
# replaces a rescan of every known list. The initial empty list is entry 0.
seen_sets = {frozenset(): 0}

for c in range(MAX_UNICODE):
    s = frozenset(bprops[c])
    i = seen_sets.get(s)
    if i is None:
        # A combination not seen before: record it as a new list.
        i = seen_sets[s] = len(bool_props_lists)
        bool_props_lists.append(bprops[c])

    # Store the offset of the list, in units of words, not its index.
    bool_props[c] = i * bool_props_list_item_size
|
||||
|
||||
# This block of code was added by PH in September 2012. It scans the other_case
|
||||
# table to find sets of more than two characters that must all match each other
|
||||
# caselessly. Later in this script a table of these sets is written out.
|
||||
# However, we have to do this work here in order to compute the offsets in the
|
||||
# table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
caseless_sets = []
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
# now have three characters that are case-equivalent.
|
||||
|
||||
if other_case[o] != -other_case[c]:
|
||||
t = o + other_case[o]
|
||||
|
||||
# Scan the existing sets to see if any of the three characters are already
|
||||
# part of a set. If so, unite the existing set with the new set.
|
||||
|
||||
appended = 0
|
||||
for s in caseless_sets:
|
||||
found = 0
|
||||
for x in s:
|
||||
if x == c or x == o or x == t:
|
||||
found = 1
|
||||
|
||||
# Add new characters to an existing set
|
||||
|
||||
if found:
|
||||
found = 0
|
||||
for y in [c, o, t]:
|
||||
for x in s:
|
||||
if x == y:
|
||||
found = 1
|
||||
if not found:
|
||||
s.append(y)
|
||||
appended = 1
|
||||
|
||||
# If we have not added to an existing set, create a new one.
|
||||
|
||||
if not appended:
|
||||
caseless_sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
caseless_offsets = [0] * MAX_UNICODE
|
||||
|
||||
offset = 1;
|
||||
for s in caseless_sets:
|
||||
for x in s:
|
||||
caseless_offsets[x] = offset
|
||||
offset += len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine all the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case, scriptx_bidi_class, bool_props)
|
||||
|
||||
# Find the record size and create a string definition of the structure for
|
||||
# outputting as a comment.
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
size += get_tables_size(stage1, stage2)
|
||||
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR WRITING THE OUTPUT FILE
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucd.c")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
/* This file contains tables of Unicode properties that are extracted from
|
||||
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
|
||||
details.
|
||||
|
||||
As well as being part of the PCRE2 library, this file is #included by the
|
||||
pcre2test program, which redefines the PRIV macro to change table names from
|
||||
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
|
||||
just one of these tables is actually needed. When compiling the library, some
|
||||
headers are needed. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* The tables herein are needed only when UCP support is built, and in PCRE2
|
||||
that happens automatically with UTF support. This module should not be
|
||||
referenced otherwise, so it should not matter whether it is compiled or not.
|
||||
However a comment was received about space saving - maybe the guy linked all
|
||||
the modules rather than using a library - so we include a condition to cut out
|
||||
the tables when not needed. But don't leave a totally empty module because some
|
||||
compilers barf at that. Instead, just supply some small dummy tables. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
|
||||
const uint16_t PRIV(ucd_stage1)[] = {0};
|
||||
const uint16_t PRIV(ucd_stage2)[] = {0};
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
|
||||
#else
|
||||
\n""")
|
||||
|
||||
# --- Output some variable heading stuff ---
|
||||
|
||||
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
|
||||
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
|
||||
|
||||
f.write("""\
|
||||
/* When recompiling tables with a new Unicode version, please check the types
|
||||
in this structure definition with those in pcre2_internal.h (the actual field
|
||||
names will be different).
|
||||
\n""")
|
||||
|
||||
f.write(record_struct)
|
||||
|
||||
f.write("""
|
||||
/* If the 32-bit library is run in non-32-bit mode, character values greater
|
||||
than 0x10ffff may be encountered. For these we set up a special record. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
const ucd_record PRIV(dummy_ucd_record)[] = {{
|
||||
ucp_Unknown, /* script */
|
||||
ucp_Cn, /* type unassigned */
|
||||
ucp_gbOther, /* grapheme break property */
|
||||
0, /* case set */
|
||||
0, /* other case */
|
||||
0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
|
||||
0, /* bool properties offset */
|
||||
}};
|
||||
#endif
|
||||
\n""")
|
||||
|
||||
# --- Output the table of caseless character sets ---
|
||||
|
||||
f.write("""\
|
||||
/* This table contains lists of characters that are caseless sets of
|
||||
more than one character. Each list is terminated by NOTACHAR. */
|
||||
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {
|
||||
NOTACHAR,
|
||||
""")
|
||||
|
||||
for s in caseless_sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
f.write(' 0x%04x,' % x)
|
||||
f.write(' NOTACHAR,\n')
|
||||
f.write('};\n\n')
|
||||
|
||||
# --- Other tables are not needed by pcre2test ---
|
||||
|
||||
f.write("""\
|
||||
/* When #included in pcre2test, we don't need the table of digit sets, nor the
|
||||
the large main UCD tables. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
\n""")
|
||||
|
||||
# --- Read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
# Collect the code point of the last digit ('9') of every run of ten decimal
# digits. Only range lines whose comment marks them as category Nd match.
digitsets = []
with open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') as file:
    for line in file:
        m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
        if m is None:
            continue
        first = int(m.group(1),16)
        last = int(m.group(2),16)
        if ((last - first + 1) % 10) != 0:
            # Report on stderr with print(); the output file's write() takes
            # no "file" keyword and would raise TypeError here.
            print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
                  file=sys.stderr)
        while first < last:
            digitsets.append(first + 9)
            first += 10
digitsets.sort()
|
||||
|
||||
f.write("""\
|
||||
/* This table lists the code points for the '9' characters in each set of
|
||||
decimal digits. It is used to ensure that all the digits in a script run come
|
||||
from the same set. */
|
||||
|
||||
const uint32_t PRIV(ucd_digit_sets)[] = {
|
||||
""")
|
||||
|
||||
f.write(" %d, /* Number of subsequent values */" % len(digitsets))
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
f.write("\n ")
|
||||
count = 0
|
||||
f.write(" 0x%05x," % d)
|
||||
count += 1
|
||||
f.write("\n};\n\n")
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of script bitsets for the Script Extension property.
|
||||
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
|
||||
ucd_script_sets_item_size. */
|
||||
|
||||
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||
""")
|
||||
write_bitsets(script_lists, script_list_item_size)
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of bitsets for Boolean properties. The number of
|
||||
32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
|
||||
pcre2_ucp.h. */
|
||||
|
||||
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||
""")
|
||||
write_bitsets(bool_props_lists, bool_props_list_item_size)
|
||||
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
f.write("""\
|
||||
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
|
||||
into a 16-bit field, and offset in binary properties table (16 bits). */
|
||||
\n""")
|
||||
|
||||
write_records(records, record_size)
|
||||
write_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
|
||||
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
|
||||
f.write("""\
|
||||
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
|
||||
#endif
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* End of pcre2_ucd.c */
|
||||
""")
|
||||
|
||||
# Call close(); a bare "f.close" only names the method and closes nothing.
f.close()
|
||||
|
||||
# End
|
|
@ -0,0 +1,98 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucp.h file from Unicode data files. This
|
||||
# header uses enumerations to give names to Unicode property types and script
|
||||
# names.
|
||||
|
||||
# This script was created in December 2021 as part of the Unicode data
|
||||
# generation refactoring.
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucp.h")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
|
||||
/* This file contains definitions of the Unicode property values that are
|
||||
returned by the UCD access macros and used throughout PCRE2.
|
||||
|
||||
IMPORTANT: The specific values of the first two enums (general and particular
|
||||
character categories) are assumed by the table called catposstab in the file
|
||||
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
|
||||
an update. */
|
||||
\n""")
|
||||
|
||||
f.write("/* These are the general character categories. */\n\nenum {\n")
|
||||
for i in general_category_names:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the particular character categories. */\n\nenum {\n")
|
||||
for i in range(0, len(category_names), 2):
|
||||
f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are Boolean properties. */\n\nenum {\n")
|
||||
for i in bool_properties:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Bprop_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
|
||||
f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
|
||||
|
||||
f.write("/* These are the bidi class values. */\n\nenum {\n")
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
sp = ' ' * (4 - len(bidi_classes[i]))
|
||||
f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are grapheme break properties. The Extended Pictographic "
|
||||
"property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
|
||||
for i in range(0, len(break_properties), 2):
|
||||
sp = ' ' * (21 - len(break_properties[i]))
|
||||
f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n")
|
||||
for i in script_names:
|
||||
if i == "Unknown":
|
||||
f.write("\n /* Scripts which has no characters in other scripts. */\n")
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("\n")
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Script_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_script_sets[] */\n\n")
|
||||
f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
|
||||
|
||||
f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
|
||||
f.write("/* End of pcre2_ucp.h */\n")
|
||||
|
||||
f.close()
|
||||
|
||||
# End
|
|
@ -0,0 +1,203 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucptables.c file, which contains tables for
|
||||
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
|
||||
# order to reduce the number of relocations when loading the PCRE2 library, the
|
||||
# names are held as a single large string, with offsets in the table. This is
|
||||
# tedious to maintain by hand. Therefore, a script is used to generate the
|
||||
# table.
|
||||
|
||||
# This script was created in December 2021 based on the previous GenerateUtt
|
||||
# script, whose output had to be manually edited into pcre2_tables.c. Here is
|
||||
# the history of the original script:
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
||||
# Added support for bidi class and bidi control, 06-December-2021
|
||||
# This also involved lower casing strings and removing underscores, in
|
||||
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
||||
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
||||
# -----------------------------------------------------------------------------
|
||||
#
|
||||
# Note subsequent changes here:
|
||||
#
|
||||
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||
# 10-January-2022: Further updates for Boolean property support
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
abbreviations, \
|
||||
bool_properties, \
|
||||
bidi_classes, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucptables.c")
|
||||
|
||||
# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
|
||||
# etc., along with comments. We need to add "bidi" in front of each value, in
|
||||
# order to create names that don't clash with other types of property.
|
||||
|
||||
bidi_class_names = []
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
bidi_class_names.append("bidi" + bidi_classes[i])
|
||||
|
||||
# Remove the comments from other lists that contain them.
|
||||
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create standardized versions of the names by lowercasing and removing
|
||||
# underscores.
|
||||
|
||||
def stdname(x):
    """Return x in lower case with every underscore removed."""
    return ''.join(x.lower().split('_'))
|
||||
|
||||
def stdnames(x):
    """Return a new list holding stdname() of each item of the sequence x."""
    return [stdname(x[pos]) for pos in range(len(x))]
|
||||
|
||||
std_category_names = stdnames(category_names)
|
||||
std_general_category_names = stdnames(general_category_names)
|
||||
std_bidi_class_names = stdnames(bidi_class_names)
|
||||
std_bool_properties = stdnames(bool_properties)
|
||||
|
||||
# Create the table, starting with the Unicode script, category and bidi class
|
||||
# names. We keep both the standardized name and the original, because the
|
||||
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||
# still use the full original names.
|
||||
|
||||
utt_table = []
|
||||
|
||||
scx_end = script_names.index('Unknown')
|
||||
|
||||
for idx, name in enumerate(script_names):
|
||||
pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
|
||||
utt_table.append((stdname(name), name, pt_type))
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, pt_type))
|
||||
|
||||
# Add the remaining property lists
|
||||
|
||||
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||
|
||||
for name in bool_properties:
|
||||
utt_table.append((stdname(name), name, 'PT_BOOL'))
|
||||
if name in abbreviations:
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
|
||||
|
||||
# Now add specials and synonyms. Note both the standardized and capitalized
|
||||
# forms are needed.
|
||||
|
||||
utt_table.append(('any', 'Any', 'PT_ANY'))
|
||||
utt_table.append(('l&', 'L&', 'PT_LAMP'))
|
||||
utt_table.append(('lc', 'LC', 'PT_LAMP'))
|
||||
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
|
||||
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
|
||||
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
|
||||
utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
|
||||
utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
|
||||
|
||||
# Remove duplicates from the table and then sort it.
|
||||
|
||||
utt_table = list(set(utt_table))
|
||||
utt_table.sort()
|
||||
|
||||
# Output file-specific heading
|
||||
|
||||
f.write("""\
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
||||
/* The PRIV(utt)[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
field of each entry. However, that leads to a large number of relocations when
|
||||
a shared library is dynamically loaded. A significant reduction is made by
|
||||
putting all the names into a single, large string and using offsets instead.
|
||||
All letters are lower cased, and underscores are removed, in accordance with
|
||||
the "loose matching" rules that Unicode advises and Perl uses. */
|
||||
\n""")
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
for utt in utt_table:
|
||||
f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))
|
||||
for c in utt[0]:
|
||||
if c == '&':
|
||||
f.write(' STR_AMPERSAND')
|
||||
else:
|
||||
f.write(' STR_%s' % c);
|
||||
f.write(' "\\0"\n')
|
||||
|
||||
# Output the long string of concatenated names
|
||||
|
||||
f.write('\nconst char PRIV(utt_names)[] =\n');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
|
||||
# Output the property type table
|
||||
|
||||
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
|
||||
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
|
||||
value = '0'
|
||||
else:
|
||||
value = 'ucp_' + utt[1]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
f.write('};\n\n')
|
||||
|
||||
# Ending text
|
||||
|
||||
# Write the closing text of the generated C file.
f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# close must actually be called: a bare "f.close" only names the bound method
# and leaves the file open, with flushing left to interpreter shutdown.
f.close()
|
||||
|
||||
# End
|
|
@ -1,137 +0,0 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Generate utt tables. Note: this script has now been converted to Python 3.
|
||||
|
||||
# The source file pcre2_tables.c contains (amongst other things), a table that
|
||||
# is indexed by script name. In order to reduce the number of relocations when
|
||||
# loading the library, the names are held as a single large string, with
|
||||
# offsets in the table. This is tedious to maintain by hand. Therefore, this
|
||||
# script is used to generate the table. The output is sent to stdout; usually
|
||||
# that should be directed to a temporary file. Then pcre2_tables.c can be
|
||||
# edited by replacing the relevant definitions and table therein with the
|
||||
# temporary file.
|
||||
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic', \
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
||||
# New for Unicode 8.0.0
|
||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
||||
'SignWriting',
|
||||
# New for Unicode 10.0.0
|
||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
||||
# New for Unicode 11.0.0
|
||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
||||
'Old_Sogdian', 'Sogdian',
|
||||
# New for Unicode 12.0.0
|
||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
||||
# New for Unicode 13.0.0
|
||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi'
|
||||
]
|
||||
|
||||
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
||||
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
||||
|
||||
general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
|
||||
|
||||
# First add the Unicode script and category names.
|
||||
|
||||
# Each entry is a (name, property type) pair: scripts first, then the
# two-letter categories, then the one-letter general categories.
utt_table = [(name, 'PT_SC') for name in script_names]
utt_table += [(name, 'PT_PC') for name in category_names]
utt_table += [(name, 'PT_GC') for name in general_category_names]

# Our own special properties.
utt_table.extend([
    ('Any', 'PT_ANY'),
    ('L&', 'PT_LAMP'),
    ('Xan', 'PT_ALNUM'),
    ('Xps', 'PT_PXSPACE'),
    ('Xsp', 'PT_SPACE'),
    ('Xuc', 'PT_UCNC'),
    ('Xwd', 'PT_WORD'),
])

# Put the entries in sorted order (tuples compare by name first).
utt_table.sort()
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
# Print one "#define STRING_<name>0" line per entry: every character of the
# name becomes a STR_ macro, with '_' and '&' spelled out by name, and the line
# ends with a "\0" string. All items are separated by single spaces.

special_char_macros = {'_': 'STR_UNDERSCORE', '&': 'STR_AMPERSAND'}

for entry in utt_table:
    name = entry[0]
    char_macros = [special_char_macros.get(ch, 'STR_%s' % ch) for ch in name]
    print('#define STRING_%s0' % name.replace('&', '_AMPERSAND'),
          *char_macros, '"\\0"')

# Print the actual table, using the string names. The final entry carries the
# terminating semicolon.

print('')
print('const char PRIV(utt_names)[] =')
terminator = ''
for entry in utt_table:
    if entry == utt_table[-1]:
        terminator = ';'
    macro_name = entry[0].replace('&', '_AMPERSAND')
    print(' STRING_%s0%s' % (macro_name, terminator))
|
||||
# This was how it was done before the EBCDIC-compatible modification.
|
||||
# print ' "%s\\0"%s' % (utt[0], last)
|
||||
|
||||
# Print the property type table. The first field of each row is the running
# offset of the entry's name (name length plus one for the "\0"). Entries with
# one of the special types below have the value 0; all others have 'ucp_'
# followed by the name.

valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                   'PT_SPACE', 'PT_UCNC', 'PT_WORD')

print('\nconst ucp_type_table PRIV(utt)[] = {')
name_offset = 0
separator = ','
for entry in utt_table:
    prop_value = '0' if entry[1] in valueless_types else 'ucp_' + entry[0]
    if entry == utt_table[-1]:
        separator = ''
    print(' { %3d, %s, %s }%s' % (name_offset, entry[1], prop_value, separator))
    name_offset += len(entry[0]) + 1
print('};')
|
|
@ -1,814 +0,0 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Multistage table builder
|
||||
# (c) Peter Kankowski, 2008
|
||||
|
||||
##############################################################################
|
||||
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
||||
# the upgrading of Unicode property support. The new code speeds up property
|
||||
# matching many times. The script is for the use of PCRE maintainers, to
|
||||
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
|
||||
# data tables. A number of extensions have been added to the original script.
|
||||
#
|
||||
# The script has now been upgraded to Python 3 for PCRE2, and should be run in
|
||||
# the maint subdirectory, using the command
|
||||
#
|
||||
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||
#
|
||||
# It requires six Unicode data tables: DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt,
|
||||
# CaseFolding.txt, and emoji-data.txt. These must be in the
|
||||
# maint/Unicode.tables subdirectory.
|
||||
#
|
||||
# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the
|
||||
# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is
|
||||
# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and
|
||||
# CaseFolding.txt are directly in the UCD directory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to this script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to this script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contains no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), containing a
|
||||
# script number, script extension value, character type, grapheme break type,
|
||||
# offset to caseless matching set, offset to the character's other case, for
|
||||
# every Unicode character. However, a real table covering all Unicode
|
||||
# characters would be far too big. It can be efficiently compressed by
|
||||
# observing that many characters have the same record, and many blocks of
|
||||
# characters (taking 128 characters in a block) have the same set of records as
|
||||
# other blocks. This leads to a 2-stage lookup process.
|
||||
#
|
||||
# This script constructs six tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# The ucd_script_sets vector contains lists of script numbers that are the
|
||||
# Script Extensions properties of certain characters. Each list is terminated
|
||||
# by zero (ucp_Unknown). A character with more than one script listed for its
|
||||
# Script Extension property has a negative value in its record. This is the
|
||||
# negated offset to the start of the relevant list in the ucd_script_sets
|
||||
# vector.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique record that is
|
||||
# required. The ucd_stage1 table is indexed by a character's block number,
|
||||
# which is the character's code point divided by 128, since 128 is the size
|
||||
# of each block. The result of a lookup in ucd_stage1 is a "virtual" block number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 11.0.0 database. Future
|
||||
# updates may change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 17
|
||||
# record 17 is { 34, 5, 12, 0, -32, 34, 0 }
|
||||
# 34 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 34 = ucp_Latin => No special Script Extension property
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 90
|
||||
# lookup 66 (0x42) in table 90 in stage2 yields 564
|
||||
# record 564 is { 27, 7, 12, 0, 0, 27, 0 }
|
||||
# 27 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 27 = ucp_Hiragana => No special Script Extension property
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 458
|
||||
# record 458 is { 28, 12, 3, 0, 0, -101, 0 }
|
||||
# 28 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# -101 => Script Extension list offset = 101
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29,
|
||||
# and terminator 0. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
|
||||
#
|
||||
# Philip Hazel, 03 July 2008
|
||||
##############################################################################
|
||||
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
|
||||
def make_get_names(enum):
    """Return a parser that maps field 1 of a data line to its index in enum.

    The returned callable takes the list of fields of one line and returns
    enum.index(fields[1]); it raises ValueError if the name is not in enum.
    """
    def name_to_index(chardata):
        return enum.index(chardata[1])
    return name_to_index
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
def get_other_case(chardata):
    """Return the other-case offset for one line of CaseFolding.txt.

    chardata holds the line's fields: code point (hex), status, mapping (hex).
    For status 'C' or 'S' the result is mapping minus code point; every other
    status yields 0.
    """
    if chardata[1] not in ('C', 'S'):
        return 0
    return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
def get_script_extension(chardata):
    """Return the Script Extensions value for one line of ScriptExtensions.txt.

    Field 1 of chardata is a space-separated list of script abbreviations.
    A single abbreviation yields its (non-negative) index in script_abbrevs.
    Several abbreviations yield the negated offset, within the global
    script_lists vector, of a zero-terminated list of their indices; the list
    is appended to script_lists if an identical run is not already present.
    """
    abbrevs = chardata[1].split(' ')
    if len(abbrevs) == 1:
        return script_abbrevs.index(abbrevs[0])

    # Indices of all the scripts, followed by the terminating zero.
    wanted = [script_abbrevs.index(abbrev) for abbrev in abbrevs]
    wanted.append(0)
    wanted_length = len(wanted)

    # Look for the same run in what has been stored so far. Offset zero is
    # skipped because the zeroth element is never used.
    for offset in range(1, len(script_lists) - wanted_length + 1):
        if script_lists[offset:offset + wanted_length] == wanted:
            return -offset

    # Not found in existing lists: append and return the new offset, negated.
    new_offset = len(script_lists)
    script_lists.extend(wanted)
    return -new_offset
|
||||
|
||||
# Read the whole table in memory, setting/checking the Unicode version
|
||||
def read_table(file_name, get_value, default_value):
    """Read one Unicode data file into a list with an entry per code point.

    file_name     -- path of the form '<directory>/<base>.txt'; the first line
                     of the file must be '# <base>-<n.n.n>.txt'
    get_value     -- callable given the stripped, ';'-separated fields of a
                     data line; returns the value to store for that line
    default_value -- value for code points that the file does not set

    Returns a list of MAX_UNICODE values. The global unicode_version is set
    from the first file read; if a later file has a different version, a
    warning naming that file is written to stderr.

    Raises AttributeError if file_name or the file's first line does not have
    the expected form (the regular expression match is None).
    """
    global unicode_version

    base_match = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
    file_base = base_match.group(1)
    version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"

    table = [default_value] * MAX_UNICODE

    # The with statement closes the file even if a line fails to parse.
    with open(file_name, 'r', encoding='utf-8') as file:
        version_match = re.match(version_pat, file.readline())
        version = version_match.group(1)
        if unicode_version == "":
            unicode_version = version
        elif unicode_version != version:
            # The file name must be formatted into the message; passing it as
            # a separate print argument would output a literal "%s".
            print("WARNING: Unicode version differs in %s" % file_name,
                  file=sys.stderr)

        for line in file:
            line = re.sub(r'#.*', '', line)
            chardata = list(map(str.strip, line.split(';')))
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)

            # Field 0 is either a single code point or a "first..last" range.
            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)

            for i in range(char, last + 1):
                # It is important not to overwrite a previously set value
                # because in the CaseFolding file there are lines to be
                # ignored (returning the default value of 0) which often come
                # after a line which has already set data.
                if table[i] == default_value:
                    table[i] = value

    return table
|
||||
|
||||
# Get the smallest possible C language type for the values
|
||||
def get_type_size(table):
    """Return (C type name, size in bytes) of the smallest type for table.

    Unsigned types are tried first, from narrowest to widest, then signed
    ones. Raises OverflowError if the values fit none of the types.
    """
    candidates = (
        ("uint8_t", 1, 0, 255),
        ("uint16_t", 2, 0, 65535),
        ("uint32_t", 4, 0, 4294967295),
        ("signed char", 1, -128, 127),
        ("pcre_int16", 2, -32768, 32767),
        ("pcre_int32", 4, -2147483648, 2147483647),
    )
    smallest = min(table)
    largest = max(table)
    for type_name, byte_size, low, high in candidates:
        if low <= smallest and largest <= high:
            return (type_name, byte_size)
    raise OverflowError("Too large to fit into C types")
|
||||
|
||||
def get_tables_size(*tables):
    """Return the total size in bytes of all the given tables.

    Each table counts as its length times the element size that
    get_type_size() chooses for it.
    """
    return sum(get_type_size(table)[1] * len(table) for table in tables)
|
||||
|
||||
# Compress the table into the two stages
|
||||
def compress_table(table, block_size):
    """Compress table into the two stages of a block lookup.

    The table is cut into consecutive blocks of block_size entries. Each
    distinct block is stored once in stage2; stage1 holds, for every block of
    the original table, the number of the stage2 block with its contents.

    Returns (stage1, stage2), both lists. stage1 contains integers.
    """
    blocks = {}  # Dictionary for finding identical blocks
    stage1 = []  # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = []  # Stage 2 table contains the blocks with property values
    table = tuple(table)
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Integer division is required here: in
            # Python 3 "/" yields a float, and block numbers are indices.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)

    return stage1, stage2
|
||||
|
||||
# Print a table
|
||||
def print_table(table, table_name, block_size = None):
    """Print table to stdout as a C array definition called table_name.

    The element type is the one chosen by get_type_size(). Without a
    block_size, rows of 16 values are printed, each followed by a U+XXXX
    comment computed from the row's position scaled to MAX_UNICODE. With a
    block_size, the values are printed block by block, each block preceded
    by a comment giving its number.
    """
    ELEMS_PER_LINE = 16
    c_type, elem_size = get_type_size(table)

    header = "const %s %s[] = { /* %d bytes" % (c_type, table_name, elem_size * len(table))
    if block_size:
        header += ", block = %d" % block_size
    print(header + " */")

    values = tuple(table)
    if block_size is None:
        row_fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
        mult = MAX_UNICODE / len(values)
        for start in range(0, len(values), ELEMS_PER_LINE):
            row = values[start:start+ELEMS_PER_LINE]
            print(row_fmt % (row + (int(start * mult),)))
    else:
        # One line holds at most ELEMS_PER_LINE values; a bigger block is
        # printed as several such lines.
        per_line = min(block_size, ELEMS_PER_LINE)
        block_fmt = "%3d," * per_line + "\n"
        if block_size > ELEMS_PER_LINE:
            block_fmt = block_fmt * int(block_size / ELEMS_PER_LINE)
        block_fmt = "/* block %d */\n" + block_fmt
        for start in range(0, len(values), block_size):
            block = values[start:start+block_size]
            print(block_fmt % ((start / block_size,) + block))
    print("};\n")
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
def combine_tables(*tables):
    """Merge parallel tables into unique records plus an index.

    Position by position, the values of all tables form one tuple (record).
    Returns (index, records): records maps each distinct tuple to a number
    assigned in order of first appearance, and index lists the record number
    for every position.
    """
    records = {}
    # setdefault() evaluates len(records) before a new tuple is inserted, so a
    # tuple seen for the first time gets the next free number.
    index = [records.setdefault(row, len(records)) for row in zip(*tables)]
    return index, records
|
||||
|
||||
def get_record_size_struct(records):
    """Return (size, structure) describing the C layout of the records.

    records is a sequence of equal-length tuples. For each field position the
    type is chosen by get_type_size() over that field's values in all
    records. size is the record size in bytes, including alignment padding
    before each field and at the end; structure is the text of a C comment
    showing the corresponding typedef.
    """
    text_parts = [
        '/* When recompiling tables with a new Unicode version, please check the\n',
        'types in this structure definition from pcre2_internal.h (the actual\n',
        'field names will be different):\n\ntypedef struct {\n',
    ]
    size = 0
    for field in range(len(records[0])):
        field_type, field_size = get_type_size([rec[field] for rec in records])
        # Add padding: round size up to a multiple of the field's own size,
        # then account for the field itself.
        size = (size + field_size - 1) & -field_size
        size += field_size
        text_parts.append('%s property_%d;\n' % (field_type, field))

    # Round up so that the first field of the next record in an array is
    # aligned as well.
    first_type, first_size = get_type_size([rec[0] for rec in records])
    size = (size + first_size - 1) & -first_size

    text_parts.append('} ucd_record;\n*/\n')
    return size, ''.join(text_parts)
|
||||
|
||||
def test_record_size():
    """Self-check: assert the size computed by get_record_size_struct().

    Each case pairs a list of records with the expected record size in bytes.
    """
    cases = (
        ([(3,), (6,), (6,), (1,)], 1),
        ([(300,), (600,), (600,), (100,)], 2),
        ([(25, 3), (6, 6), (34, 6), (68, 1)], 2),
        ([(300, 3), (6, 6), (340, 6), (690, 1)], 4),
        ([(3, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(300, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(3, 100000), (6, 6), (6, 123456), (1, 690)], 8),
        ([(100000, 300), (6, 6), (123456, 6), (1, 690)], 8),
    )
    for records, expected_size in cases:
        size, struct = get_record_size_struct(records)
        assert(size == expected_size)
|
||||
#print struct
|
||||
|
||||
def print_records(records, record_size):
    """Print the ucd_records C array to stdout.

    records maps each record tuple to its number; rows are printed in
    ascending order of that number, each followed by a comment giving its
    position. record_size is only used in the header comment.
    """
    print('const ucd_record PRIV(ucd_records)[] = { ' + \
        '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))

    ordered = sorted(records.items(), key = lambda item: item[1])
    for position, (fields, _) in enumerate(ordered):
        row_fmt = ' {' + '%6d, ' * len(fields) + '}, /* %3d */'
        print(row_fmt % (fields + (position,)))
    print('};\n')
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic',
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
||||
# New for Unicode 8.0.0
|
||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
||||
'SignWriting',
|
||||
# New for Unicode 10.0.0
|
||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
||||
# New for Unicode 11.0.0
|
||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
||||
'Old_Sogdian', 'Sogdian',
|
||||
# New for Unicode 12.0.0
|
||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
||||
# New for Unicode 13.0.0
|
||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi'
|
||||
]
|
||||
|
||||
script_abbrevs = [
|
||||
'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
|
||||
'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
|
||||
'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
|
||||
'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
|
||||
'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
|
||||
'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
|
||||
'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
|
||||
#New for Unicode 5.0
|
||||
'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
|
||||
#New for Unicode 5.1
|
||||
'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
|
||||
'Sund', 'Vaii',
|
||||
#New for Unicode 5.2
|
||||
'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
|
||||
'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
|
||||
#New for Unicode 6.0.0
|
||||
'Batk', 'Brah', 'Mand',
|
||||
#New for Unicode 6.1.0
|
||||
'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
|
||||
#New for Unicode 7.0.0
|
||||
'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
|
||||
'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
|
||||
'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
|
||||
#New for Unicode 8.0.0
|
||||
'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
|
||||
#New for Unicode 10.0.0
|
||||
'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
|
||||
'Zanb',
|
||||
#New for Unicode 11.0.0
|
||||
'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
|
||||
#New for Unicode 12.0.0
|
||||
'Elym', 'Nand', 'Hmnp', 'Wcho',
|
||||
#New for Unicode 13.0.0
|
||||
'Chrs', 'Diak', 'Kits', 'Yezi'
|
||||
]
|
||||
|
||||
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
||||
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
|
||||
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
|
||||
'ZWJ', 'Extended_Pictographic' ]
|
||||
|
||||
# Run the self-check of the record size computation before doing any work.
test_record_size()

# Filled in by the first call of read_table(), checked by the later ones.
unicode_version = ""

# One list per property, each with an entry for every code point. The last
# argument of each call is the value for code points absent from the file.
script = read_table(
    'Unicode.tables/Scripts.txt',
    make_get_names(script_names),
    script_names.index('Unknown'))
category = read_table(
    'Unicode.tables/DerivedGeneralCategory.txt',
    make_get_names(category_names),
    category_names.index('Cn'))
break_props = read_table(
    'Unicode.tables/GraphemeBreakProperty.txt',
    make_get_names(break_property_names),
    break_property_names.index('Other'))
other_case = read_table(
    'Unicode.tables/CaseFolding.txt',
    get_other_case,
    0)
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
# Scan emoji-data.txt and give every code point that has the
# Extended_Pictographic property that value in break_props. A warning is
# written to stderr if such a code point had a break property other than
# 'Other'. The with statement closes the file even if a line fails to parse.

_gb_other = break_property_names.index('Other')
_gb_extended_pictographic = break_property_names.index('Extended_Pictographic')

with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        chardata = list(map(str.strip, line.split(';')))
        if len(chardata) <= 1:
            continue

        if chardata[1] != "Extended_Pictographic":
            continue

        # Field 0 is either a single code point or a "first..last" range.
        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
        char = int(m.group(1), 16)
        if m.group(3) is None:
            last = char
        else:
            last = int(m.group(3), 16)
        for i in range(char, last + 1):
            if break_props[i] != _gb_other:
                # The values must be formatted into the message; passing them
                # as separate print arguments would output the "%x" and "%s"
                # literally.
                print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
                      (i, break_property_names[break_props[i]]), file=sys.stderr)
            break_props[i] = _gb_extended_pictographic
|
||||
|
||||
# The Script Extensions property default value is the Script value. Parse the
|
||||
# file, setting 'Unknown' as the default (this will never be a Script Extension
|
||||
# value), then scan it and fill in the default from Scripts. Code added by PH
|
||||
# in October 2018. Positive values are used for just a single script for a
|
||||
# code point. Negative values are negated offsets in a list of lists of
|
||||
# multiple scripts. Initialize this list with a single entry, as the zeroth
|
||||
# element is never used.
|
||||
|
||||
script_lists = [0]
|
||||
script_abbrevs_default = script_abbrevs.index('Zzzz')
|
||||
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
|
||||
|
||||
for i in range(0, MAX_UNICODE):
|
||||
if scriptx[i] == script_abbrevs_default:
|
||||
scriptx[i] = script[i]
|
||||
|
||||
# With the addition of the new Script Extensions field, we need some padding
|
||||
# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
|
||||
# greater than 255 to make the field 16 bits.
|
||||
|
||||
padding_dummy = [0] * MAX_UNICODE
|
||||
padding_dummy[0] = 256
|
||||
|
||||
# This block of code was added by PH in September 2012. I am not a Python
|
||||
# programmer, so the style is probably dreadful, but it does the job. It scans
|
||||
# the other_case table to find sets of more than two characters that must all
|
||||
# match each other caselessly. Later in this script a table of these sets is
|
||||
# written out. However, we have to do this work here in order to compute the
|
||||
# offsets in the table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
sets = []
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
# now have three characters that are case-equivalent.
|
||||
|
||||
if other_case[o] != -other_case[c]:
|
||||
t = o + other_case[o]
|
||||
|
||||
# Scan the existing sets to see if any of the three characters are already
|
||||
# part of a set. If so, unite the existing set with the new set.
|
||||
|
||||
appended = 0
|
||||
for s in sets:
|
||||
found = 0
|
||||
for x in s:
|
||||
if x == c or x == o or x == t:
|
||||
found = 1
|
||||
|
||||
# Add new characters to an existing set
|
||||
|
||||
if found:
|
||||
found = 0
|
||||
for y in [c, o, t]:
|
||||
for x in s:
|
||||
if x == y:
|
||||
found = 1
|
||||
if not found:
|
||||
s.append(y)
|
||||
appended = 1
|
||||
|
||||
# If we have not added to an existing set, create a new one.
|
||||
|
||||
if not appended:
|
||||
sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
caseless_offsets = [0] * MAX_UNICODE
|
||||
|
||||
offset = 1;
|
||||
for s in sets:
|
||||
for x in s:
|
||||
caseless_offsets[x] = offset
|
||||
offset += len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case, scriptx, padding_dummy)
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
size += get_tables_size(stage1, stage2)
|
||||
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
print("/* This module is generated by the maint/MultiStage2.py script.")
|
||||
print("Do not modify it by hand. Instead modify the script and run it")
|
||||
print("to regenerate this code.")
|
||||
print()
|
||||
print("As well as being part of the PCRE2 library, this module is #included")
|
||||
print("by the pcre2test program, which redefines the PRIV macro to change")
|
||||
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
|
||||
print("with the library. At present, just one of these tables is actually")
|
||||
print("needed. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_PCRE2TEST")
|
||||
print()
|
||||
print("#ifdef HAVE_CONFIG_H")
|
||||
print("#include \"config.h\"")
|
||||
print("#endif")
|
||||
print()
|
||||
print("#include \"pcre2_internal.h\"")
|
||||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
print()
|
||||
print("/* Unicode character database. */")
|
||||
print("/* This file was autogenerated by the MultiStage2.py script. */")
|
||||
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
|
||||
print()
|
||||
print("/* The tables herein are needed only when UCP support is built,")
|
||||
print("and in PCRE2 that happens automatically with UTF support.")
|
||||
print("This module should not be referenced otherwise, so")
|
||||
print("it should not matter whether it is compiled or not. However")
|
||||
print("a comment was received about space saving - maybe the guy linked")
|
||||
print("all the modules rather than using a library - so we include a")
|
||||
print("condition to cut out the tables when not needed. But don't leave")
|
||||
print("a totally empty module because some compilers barf at that.")
|
||||
print("Instead, just supply some small dummy tables. */")
|
||||
print()
|
||||
print("#ifndef SUPPORT_UNICODE")
|
||||
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
|
||||
print("const uint16_t PRIV(ucd_stage1)[] = {0};")
|
||||
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
|
||||
print("#else")
|
||||
print()
|
||||
print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
|
||||
print()
|
||||
print("/* If the 32-bit library is run in non-32-bit mode, character values")
|
||||
print("greater than 0x10ffff may be encountered. For these we set up a")
|
||||
print("special record. */")
|
||||
print()
|
||||
print("#if PCRE2_CODE_UNIT_WIDTH == 32")
|
||||
print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
|
||||
print(" ucp_Unknown, /* script */")
|
||||
print(" ucp_Cn, /* type unassigned */")
|
||||
print(" ucp_gbOther, /* grapheme break property */")
|
||||
print(" 0, /* case set */")
|
||||
print(" 0, /* other case */")
|
||||
print(" ucp_Unknown, /* script extension */")
|
||||
print(" 0, /* dummy filler */")
|
||||
print(" }};")
|
||||
print("#endif")
|
||||
print()
|
||||
print(record_struct)
|
||||
|
||||
# --- Added by PH: output the table of caseless character sets ---
|
||||
|
||||
print("/* This table contains lists of characters that are caseless sets of")
|
||||
print("more than one character. Each list is terminated by NOTACHAR. */\n")
|
||||
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
|
||||
print(" NOTACHAR,")
|
||||
for s in sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
print(' 0x%04x,' % x, end=' ')
|
||||
print(' NOTACHAR,')
|
||||
print('};')
|
||||
print()
|
||||
|
||||
# ------
|
||||
|
||||
print("/* When #included in pcre2test, we don't need the table of digit")
|
||||
print("sets, nor the the large main UCD tables. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_PCRE2TEST")
|
||||
print()
|
||||
|
||||
# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
digitsets = []
|
||||
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
|
||||
|
||||
for line in file:
|
||||
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
|
||||
if m is None:
|
||||
continue
|
||||
first = int(m.group(1),16)
|
||||
last = int(m.group(2),16)
|
||||
if ((last - first + 1) % 10) != 0:
|
||||
print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
|
||||
file=sys.stderr)
|
||||
while first < last:
|
||||
digitsets.append(first + 9)
|
||||
first += 10
|
||||
file.close()
|
||||
digitsets.sort()
|
||||
|
||||
print("/* This table lists the code points for the '9' characters in each")
|
||||
print("set of decimal digits. It is used to ensure that all the digits in")
|
||||
print("a script run come from the same set. */\n")
|
||||
print("const uint32_t PRIV(ucd_digit_sets)[] = {")
|
||||
|
||||
print(" %d, /* Number of subsequent values */" % len(digitsets), end='')
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
print("\n ", end='')
|
||||
count = 0
|
||||
print(" 0x%05x," % d, end='')
|
||||
count += 1
|
||||
print("\n};\n")
|
||||
|
||||
print("/* This vector is a list of lists of scripts for the Script Extension")
|
||||
print("property. Each sublist is zero-terminated. */\n")
|
||||
print("const uint8_t PRIV(ucd_script_sets)[] = {")
|
||||
|
||||
count = 0
|
||||
print(" /* 0 */", end='')
|
||||
for d in script_lists:
|
||||
print(" %3d," % d, end='')
|
||||
count += 1
|
||||
if d == 0:
|
||||
print("\n /* %3d */" % count, end='')
|
||||
print("\n};\n")
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
print("/* These are the main two-stage UCD tables. The fields in each record are:")
|
||||
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
|
||||
print("offset to multichar other cases or zero (8 bits), offset to other case")
|
||||
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
|
||||
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
|
||||
|
||||
print_records(records, record_size)
|
||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
|
||||
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
|
||||
print("#endif")
|
||||
print("#endif /* SUPPORT_UNICODE */")
|
||||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
|
||||
|
||||
# This code was part of the original contribution, but is commented out as it
|
||||
# was never used. A two-stage table has sufficed.
|
||||
|
||||
"""
|
||||
|
||||
# Three-stage tables:
|
||||
|
||||
# Find the optimum block size for 3-stage table
|
||||
min_size = sys.maxint
|
||||
for stage3_block in [2 ** i for i in range(2,6)]:
|
||||
stage_i, stage3 = compress_table(table, stage3_block)
|
||||
for stage2_block in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * 4
|
||||
stage1, stage2 = compress_table(stage_i, stage2_block)
|
||||
size += get_tables_size(stage1, stage2, stage3)
|
||||
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
||||
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
||||
|
||||
print "/* Total size: %d bytes" % min_size */
|
||||
print_records(records)
|
||||
print_table(min_stage1, 'ucd_stage1')
|
||||
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
||||
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
||||
|
||||
"""
|
277
maint/README
277
maint/README
|
@ -16,99 +16,122 @@ and also contains some notes for maintainers. Its contents are:
|
|||
Files in the maint directory
|
||||
============================
|
||||
|
||||
GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
|
||||
that contains Unicode script names in a long string with
|
||||
offsets, which is tedious to maintain by hand.
|
||||
GenerateCommon.py
|
||||
A Python module containing data and functions that are used by the other
|
||||
Generate scripts.
|
||||
|
||||
GenerateTest26.py
|
||||
A Python script that generates input and expected output test data for test
|
||||
26, which tests certain aspects of Unicode property support.
|
||||
|
||||
ManyConfigTests A shell script that runs "configure, make, test" a number of
|
||||
times with different configuration settings.
|
||||
GenerateUcd.py
|
||||
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
|
||||
and Unicode data files, which are themselves downloaded from the Unicode web
|
||||
site. The generated file contains the tables for a 2-stage lookup of Unicode
|
||||
properties, along with some auxiliary tables. The script starts with a long
|
||||
comment that gives details of the tables it constructs.
|
||||
|
||||
MultiStage2.py A Python script that generates the file pcre2_ucd.c from six
|
||||
Unicode data files, which are themselves downloaded from the
|
||||
Unicode web site. Run this script in the "maint" directory.
|
||||
The generated file is written to stdout. It contains the
|
||||
tables for a 2-stage lookup of Unicode properties, along with
|
||||
some auxiliary tables.
|
||||
GenerateUcpHeader.py
|
||||
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
|
||||
and Unicode data files. The generated file defines constants for various
|
||||
Unicode property values.
|
||||
|
||||
GenerateUcpTables.py
|
||||
A Python script that generates the file pcre2_ucptables.c from
|
||||
GenerateCommon.py and Unicode data files. The generated file contains tables
|
||||
for looking up Unicode property names.
|
||||
|
||||
ManyConfigTests
|
||||
A shell script that runs "configure, make, test" a number of times with
|
||||
different configuration settings.
|
||||
|
||||
pcre2_chartables.c.non-standard
|
||||
This is a set of character tables that came from a Windows
|
||||
system. It has characters greater than 128 that are set as
|
||||
spaces, amongst other things. I kept it so that it can be
|
||||
used for testing from time to time.
|
||||
This is a set of character tables that came from a Windows system. It has
|
||||
characters greater than 128 that are set as spaces, amongst other things. I
|
||||
kept it so that it can be used for testing from time to time.
|
||||
|
||||
README This file.
|
||||
README
|
||||
This file.
|
||||
|
||||
Unicode.tables The files in this directory were downloaded from the Unicode
|
||||
web site. They contain information about Unicode characters
|
||||
and scripts. The ones used by the MultiStage2.py script are
|
||||
CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt,
|
||||
ScriptExtensions.txt, GraphemeBreakProperty.txt, and
|
||||
emoji-data.txt. I've kept UnicodeData.txt (which is no longer
|
||||
used by the script) because it is useful occasionally for
|
||||
manually looking up the details of certain characters.
|
||||
However, note that character names in this file such as
|
||||
"Arabic sign sanah" do NOT mean that the character is in a
|
||||
particular script (in this case, Arabic). Scripts.txt and
|
||||
ScriptExtensions.txt are where to look for script information.
|
||||
Unicode.tables
|
||||
The files in this directory were downloaded from the Unicode web site. They
|
||||
contain information about Unicode characters and scripts, and are used by the
|
||||
Generate scripts. There is also UnicodeData.txt, which is no longer used by
|
||||
any script, but is kept because it is useful occasionally for manually looking up the
|
||||
details of certain characters. However, note that character names in this
|
||||
file such as "Arabic sign sanah" do NOT mean that the character is in a
|
||||
particular script (in this case, Arabic). Scripts.txt and
|
||||
ScriptExtensions.txt are where to look for script information.
|
||||
|
||||
ucptest.c A short C program for testing the Unicode property macros
|
||||
that do lookups in the pcre2_ucd.c data, mainly useful after
|
||||
rebuilding the Unicode property table. Compile and run this in
|
||||
the "maint" directory (see comments at its head). This program
|
||||
can also be used to find characters with specific properties.
|
||||
ucptest.c
|
||||
A program for testing the Unicode property macros that do lookups in the
|
||||
pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
|
||||
Compile and run this in the "maint" directory (see comments at its head).
|
||||
This program can also be used to find characters with specific properties and
|
||||
to list which properties are supported.
|
||||
|
||||
ucptestdata A directory containing four files, testinput{1,2} and
|
||||
testoutput{1,2}, for use in conjunction with the ucptest
|
||||
program.
|
||||
ucptestdata
|
||||
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
|
||||
use in conjunction with the ucptest program.
|
||||
|
||||
utf8.c A short, freestanding C program for converting a Unicode code
|
||||
point into a sequence of bytes in the UTF-8 encoding, and vice
|
||||
versa. If its argument is a hex number such as 0x1234, it
|
||||
outputs a list of the equivalent UTF-8 bytes. If its argument
|
||||
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
|
||||
treats them as a UTF-8 character and outputs the equivalent
|
||||
code point in hex. See comments at its head for details.
|
||||
utf8.c
|
||||
A short, freestanding C program for converting a Unicode code point into a
|
||||
sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
|
||||
hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
|
||||
If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
|
||||
treats them as a UTF-8 string and outputs the equivalent code points in hex.
|
||||
See comments at its head for details.
|
||||
|
||||
|
||||
Updating to a new Unicode release
|
||||
=================================
|
||||
|
||||
When there is a new release of Unicode, the files in Unicode.tables must be
|
||||
refreshed from the web site. If the new version of Unicode adds new character
|
||||
scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
|
||||
GenerateUtt.py scripts must be edited to add the new names. I have been adding
|
||||
each new group at the end of the relevant list, with a comment. Note also that
|
||||
both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode
|
||||
script names.
|
||||
refreshed from the web site. Once that is done, the four Python scripts that
|
||||
generate files from the Unicode data can be run from within the "maint"
|
||||
directory.
|
||||
|
||||
MultiStage2.py has two lists: the full names and the abbreviations that are
|
||||
found in the ScriptExtensions.txt file. A list of script names and their
|
||||
abbreviations can be found in the PropertyValueAliases.txt file on the
|
||||
Unicode web site. There is also a Wikipedia page that lists them, and notes the
|
||||
Unicode version in which they were introduced:
|
||||
Note: Previously, it was necessary to update lists of scripts and their
|
||||
abbreviations by hand before running the Python scripts. This is no longer
|
||||
necessary because the scripts have been upgraded to extract this information
|
||||
themselves. Also, there used to be explicit lists of scripts in two of the man
|
||||
pages. This is no longer the case; the pcre2test program can now output a list
|
||||
of supported scripts.
|
||||
|
||||
https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
|
||||
You can give an output file name as an argument to the following scripts, but
|
||||
by default:
|
||||
|
||||
Once the script name lists have been updated, MultiStage2.py can be run to
|
||||
generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to
|
||||
generate the tricky tables for inclusion in pcre2_tables.c (which must be
|
||||
hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x
|
||||
not in list", the cause is usually a missing (or misspelt) name in one of the
|
||||
lists of scripts.
|
||||
GenerateUcd.py creates pcre2_ucd.c )
|
||||
GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory
|
||||
GenerateUcpTables.py creates pcre2_ucptables.c )
|
||||
|
||||
The ucptest program can be compiled and used to check that the new tables in
|
||||
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
|
||||
number of test characters. It used to be necessary to update the source
|
||||
ucptest.c whenever new Unicode scripts were added, but this is no longer
|
||||
required because that program now uses the lists in the PCRE2 source. However,
|
||||
adding a few tests for new scripts to the files in ucptestdata is a good idea.
|
||||
These files can be compared against the existing versions in the src directory
|
||||
to check on any changes before replacing the old files, but you can also
|
||||
generate directly into the final location by running:
|
||||
|
||||
./GenerateUcd.py ../src/pcre2_ucd.c
|
||||
./GenerateUcpHeader.py ../src/pcre2_ucp.h
|
||||
./GenerateUcpTables.py ../src/pcre2_ucptables.c
|
||||
|
||||
Once the .c and .h files are in the ../src directory, the ucptest program can
|
||||
be compiled and used to check that the new tables work properly. The data files
|
||||
in ucptestdata are set up to check a number of test characters. See the
|
||||
comments at the start of ucptest.c. If there are new scripts, adding a few
|
||||
tests to the files in ucptestdata is a good idea.
|
||||
|
||||
Finally, you should run the GenerateTest26.py script to regenerate new versions
|
||||
of the input and expected output from a series of Unicode property tests that
|
||||
are automatically generated from the Unicode data files. By default, the files
|
||||
are written to testinput26 and testoutput26 in the current directory, but you
|
||||
can give an alternative directory name as an argument to the script. These
|
||||
files should eventually be installed in the main testdata directory.
|
||||
|
||||
|
||||
Preparing for a PCRE2 release
|
||||
=============================
|
||||
|
||||
This section contains a checklist of things that I consult before building a
|
||||
distribution for a new release.
|
||||
This section contains a checklist of things that I do before building a new
|
||||
release.
|
||||
|
||||
. Ensure that the version number and version date are correct in configure.ac.
|
||||
|
||||
|
@ -117,19 +140,19 @@ distribution for a new release.
|
|||
|
||||
. If new build options or new source files have been added, ensure that they
|
||||
are added to the CMake files as well as to the autoconf files. The relevant
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release
|
||||
tarball, test it out with CMake if there have been changes here.
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release, test
|
||||
it out with CMake if there have been changes here.
|
||||
|
||||
. Run ./autogen.sh to ensure everything is up-to-date.
|
||||
|
||||
. Compile and test with many different config options, and combinations of
|
||||
options. Also, test with valgrind by running "RunTest valgrind" and
|
||||
"RunGrepTest valgrind" (which takes quite a long time). The script
|
||||
maint/ManyConfigTests now encapsulates this testing. It runs tests with
|
||||
different configurations, and it also runs some of them with valgrind, all of
|
||||
which can take quite some time.
|
||||
"RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
|
||||
this testing. It runs tests with different configurations, and it also runs
|
||||
some of them with valgrind, all of which can take quite some time.
|
||||
|
||||
. Run tests in both 32-bit and 64-bit environments if possible.
|
||||
. Run tests in both 32-bit and 64-bit environments if possible. I can no longer
|
||||
run 32-bit tests.
|
||||
|
||||
. Run tests with two or more different compilers (e.g. clang and gcc), and
|
||||
make use of -fsanitize=address and friends where possible. For gcc,
|
||||
|
@ -140,7 +163,9 @@ distribution for a new release.
|
|||
be added when compiling with JIT. Another useful clang option is
|
||||
-fsanitize=signed-integer-overflow
|
||||
|
||||
. Do a test build using CMake.
|
||||
. Do a test build using CMake. Remove src/config.h first, lest it override the
|
||||
version that CMake creates. Also do a CMake unity build to check that it
|
||||
still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.
|
||||
|
||||
. Run perltest.sh on the test data for tests 1 and 4. The output should match
|
||||
the PCRE2 test output, apart from the version identification at the start of
|
||||
|
@ -159,12 +184,12 @@ distribution for a new release.
|
|||
systems. For example, on Solaris it is helpful to test using Sun's cc
|
||||
compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
|
||||
64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
|
||||
for test 2. Since I retired I can no longer do much of this, but instead I
|
||||
rely on putting out release candidates for folks on the pcre-dev list to
|
||||
test.
|
||||
for test 2. Since I retired I can no longer do much of this. There are
|
||||
automated tests under Ubuntu, Alpine, and Windows that are now set up as
|
||||
GitHub actions. Check that they are running clean.
|
||||
|
||||
. The buildbots at http://buildfarm.opencsw.org/ do some automated testing
|
||||
of PCRE2 and should be checked before putting out a release.
|
||||
of PCRE2 and should also be checked before putting out a release.
|
||||
|
||||
|
||||
Updating version info for libtool
|
||||
|
@ -214,20 +239,20 @@ changes in a shared library:
|
|||
Making a PCRE2 release
|
||||
======================
|
||||
|
||||
Run PrepareRelease and commit the files that it changes (by removing trailing
|
||||
spaces). The first thing this script does is to run CheckMan on the man pages;
|
||||
if it finds any markup errors, it reports them and then aborts.
|
||||
Run PrepareRelease and commit the files that it changes. The first thing this
|
||||
script does is to run CheckMan on the man pages; if it finds any markup errors,
|
||||
it reports them and then aborts. Otherwise it removes trailing spaces from
|
||||
sources and refreshes the HTML documentation. Update the GitHub repository with
|
||||
"git push".
|
||||
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
|
||||
and the zipball. Double-check with "svn status", then create an SVN tagged
|
||||
copy:
|
||||
|
||||
svn copy svn://vcs.exim.org/pcre2/code/trunk \
|
||||
svn://vcs.exim.org/pcre2/code/tags/pcre2-10.xx
|
||||
and the zipball. I then sign these files. Double-check with "git status" that
|
||||
the repository is fully up-to-date, then create a new tag and a release on
|
||||
GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
|
||||
GitHub release.
|
||||
|
||||
When the new release is out, don't forget to tell webmaster@pcre.org and the
|
||||
mailing list. Also, update the list of version numbers in Bugzilla
|
||||
(administration > products > PCRE > Edit versions).
|
||||
mailing list.
|
||||
|
||||
|
||||
Future ideas (wish list)
|
||||
|
@ -235,7 +260,8 @@ Future ideas (wish list)
|
|||
|
||||
This section records a list of ideas so that they do not get forgotten. They
|
||||
vary enormously in their usefulness and potential for implementation. Some are
|
||||
very sensible; some are rather wacky. Some have been on this list for years.
|
||||
very sensible; some are rather wacky. Some have been on this list for many
|
||||
years.
|
||||
|
||||
. Optimization
|
||||
|
||||
|
@ -276,9 +302,6 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
|
||||
. An option to convert results into character offsets and character lengths.
|
||||
|
||||
. An option for pcre2grep to scan only the start of a file. I am not keen -
|
||||
this is the job of "head".
|
||||
|
||||
. A (non-Unix) user wanted pcre2grep options to (a) list a file name just once,
|
||||
preceded by a blank line, instead of adding it to every matched line, and (b)
|
||||
support --outputfile=name.
|
||||
|
@ -317,10 +340,9 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
|
||||
. PCRE2 cannot at present distinguish between subpatterns with different names,
|
||||
but the same number (created by the use of ?|). In order to do so, a way of
|
||||
remembering *which* subpattern numbered n matched is needed. Bugzilla #760.
|
||||
(*MARK) can perhaps be used as a way round this problem. However, note that
|
||||
Perl does not distinguish: like PCRE2, a name is just an alias for a number
|
||||
in Perl.
|
||||
remembering *which* subpattern numbered n matched is needed. (*MARK) can
|
||||
perhaps be used as a way round this problem. However, note that Perl does not
|
||||
distinguish: like PCRE2, a name is just an alias for a number in Perl.
|
||||
|
||||
. Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
|
||||
"something" and then the #ifdef appears only in one place, in "something".
|
||||
|
@ -346,10 +368,6 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
|
||||
See Unicode TR 29. The last two are very much aimed at natural language.
|
||||
|
||||
. (?[...]) extended classes: big project.
|
||||
|
||||
. Bugzilla #1694 requests backwards searching.
|
||||
|
||||
. Allow a callout to specify a number of characters to skip. This can be done
|
||||
compatibly via an extra callout field.
|
||||
|
||||
|
@ -361,9 +379,6 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
. A limit on substitutions: a user suggested somehow finding a way of making
|
||||
match_limit apply to the whole operation instead of each match separately.
|
||||
|
||||
. Redesign handling of class/nclass/xclass because the compile code logic is
|
||||
currently very contorted and obscure.
|
||||
|
||||
. Some #defines could be replaced with enums to improve robustness.
|
||||
|
||||
. There was a request for an option for pcre2_match() to return the longest
|
||||
|
@ -380,7 +395,8 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
The test function could make use of get_substrings() to cover more code.
|
||||
|
||||
. A neater way of handling recursion file names in pcre2grep, e.g. a single
|
||||
buffer that can grow.
|
||||
buffer that can grow. See also GitHub issue #2 (recursion looping via
|
||||
symlinks).
|
||||
|
||||
. A user suggested that before/after parameters in pcre2grep could have
|
||||
negative values, to list lines near to the matched line, but not necessarily
|
||||
|
@ -395,14 +411,7 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
. Breaking loops that match an empty string: perhaps find a way of continuing
|
||||
if *something* has changed, but this might mean remembering additional data.
|
||||
"Something" could be a capture value, but then a list of previous values
|
||||
would be needed to avoid a cycle of changes. Bugzilla #2182.
|
||||
|
||||
. The use of \K in assertions is problematic. There was some talk of Perl
|
||||
banning this, but it hasn't happened. Some problems could be avoided by
|
||||
not allowing it to set a value before the match start; others by not allowing
|
||||
it to set a value after the match end. This could be controlled by an option
|
||||
such as PCRE2_SANE_BACKSLASH_K, for compatibility (or possibly make the sane
|
||||
behaviour the default and implement PCRE2_INSANE_BACKSLASH_K).
|
||||
would be needed to avoid a cycle of changes.
|
||||
|
||||
. If a function could be written to find 3-character (or other length) fixed
|
||||
strings, at least one of which must be present for a match, efficient
|
||||
|
@ -410,6 +419,8 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
|
||||
. If pcre2grep had --first-line (match only in the first line) it could be
|
||||
efficiently used to find files "starting with xxx". What about --last-line?
|
||||
There was also the suggestion of an option for pcre2grep to scan only the
|
||||
start of a file. I am not keen - this is the job of "head".
|
||||
|
||||
. A user requested a means of determining whether a match failure was caused by
|
||||
the start-of-match optimizations, or by running the match engine. Easy enough
|
||||
|
@ -419,25 +430,31 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
interpreters? JIT already does some of this, but it may not be worth it for
|
||||
the interpreters.
|
||||
|
||||
. There was a request for a way of re-defining \w (and therefore \W, \b, and
|
||||
\B). An in-pattern sequence such as (?w=[...]) was suggested. Easiest way
|
||||
would be simply to inline the class, with lookarounds for \b and \B. Ideally
|
||||
the setting should last till the end of the group, which means remembering
|
||||
all previous settings; maybe a fixed amount of stack would do - how deep
|
||||
would anyone want to nest these things? Bugzilla #2301.
|
||||
|
||||
. Recognize the short script names. They are already listed in maint/
|
||||
Multistage2.py because they are needed for scanning the script extensions
|
||||
file.
|
||||
|
||||
. Use script extensions for \p?
|
||||
. Redesign handling of class/nclass/xclass because the compile code logic is
|
||||
currently very contorted and obscure. Also there was a request for a way of
|
||||
re-defining \w (and therefore \W, \b, and \B). An in-pattern sequence such as
|
||||
(?w=[...]) was suggested. Easiest way would be simply to inline the class,
|
||||
with lookarounds for \b and \B. Ideally the setting should last till the end
|
||||
of the group, which means remembering all previous settings; maybe a fixed
|
||||
amount of stack would do - how deep would anyone want to nest these things?
|
||||
See GitHub issue #13 for a compendium of character class issues, including
|
||||
(?[...]) extended classes.
|
||||
|
||||
. A user suggested something like --with-build-info to set a build information
|
||||
string that could be retrieved by pcre2_config(). However, there's no
|
||||
facility for a length limit in pcre2_config(), and what would be the
|
||||
encoding?
|
||||
|
||||
. Quantified groups with a fixed count currently operate by replicating the
|
||||
group in the compiled bytecode. This may not really matter in these days of
|
||||
gigabyte memory, but perhaps another implementation might be considered.
|
||||
Needs coordination between the interpreters and JIT.
|
||||
|
||||
. There are regular requests for variable-length lookbehinds.
|
||||
|
||||
. See also any suggestions in the GitHub issues.
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 01 April 2020
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 25 April 2022
|
||||
|
|
|
@ -0,0 +1,633 @@
|
|||
# BidiMirroring-14.0.0.txt
|
||||
# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
|
||||
# © 2021 Unicode®, Inc.
|
||||
# For terms of use, see https://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see https://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Bidi_Mirroring_Glyph Property
|
||||
#
|
||||
# This file is an informative contributory data file in the
|
||||
# Unicode Character Database.
|
||||
#
|
||||
# This data file lists characters that have the Bidi_Mirrored=Yes property
|
||||
# value, for which there is another Unicode character that typically has a glyph
|
||||
# that is the mirror image of the original character's glyph.
|
||||
#
|
||||
# The repertoire covered by the file is Unicode 14.0.0.
|
||||
#
|
||||
# The file contains a list of lines with mappings from one code point
|
||||
# to another one for character-based mirroring.
|
||||
# Note that for "real" mirroring, a rendering engine needs to select
|
||||
# appropriate alternative glyphs, and that many Unicode characters do not
|
||||
# have a mirror-image Unicode character.
|
||||
#
|
||||
# Each mapping line contains two fields, separated by a semicolon (';').
|
||||
# Each of the two fields contains a code point represented as a
|
||||
# variable-length hexadecimal value with 4 to 6 digits.
|
||||
# A comment indicates where the characters are "BEST FIT" mirroring.
|
||||
#
|
||||
# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
|
||||
# characters exist with mirrored glyphs, are
|
||||
# listed as comments at the end of the file.
|
||||
#
|
||||
# Formally, the default value of the Bidi_Mirroring_Glyph property
|
||||
# for each code point is <none>, unless a mapping to
|
||||
# some other character is specified in this data file. When a code
|
||||
# point has the default value for the Bidi_Mirroring_Glyph property,
|
||||
# that means that no other character exists whose glyph is suitable
|
||||
# for character-based mirroring.
|
||||
#
|
||||
# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
|
||||
# at https://www.unicode.org/reports/tr9/
|
||||
#
|
||||
# This file was originally created by Markus Scherer.
|
||||
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
|
||||
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
|
||||
#
|
||||
# Historical and Compatibility Information:
|
||||
#
|
||||
# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
|
||||
# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
|
||||
# See https://www.microsoft.com/typography/otspec/ompl.txt
|
||||
#
|
||||
# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
|
||||
# added one mirroring pair: 27CB <--> 27CD.
|
||||
#
|
||||
# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
|
||||
# underwent a substantial revision, to formally recognize all of the
|
||||
# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
|
||||
# added after the freezing of the OMPL list. As a result, starting
|
||||
# with Unicode 11.0, the bmg mapping values more accurately reflect
|
||||
# the current status of glyphs for Bidi_Mirrored characters in
|
||||
# the Unicode Standard, but this listing now extends significantly
|
||||
# beyond the frozen OMPL list. Implementers should be aware of this
|
||||
# intentional distinction.
|
||||
#
|
||||
# ############################################################
|
||||
#
|
||||
# Property: Bidi_Mirroring_Glyph
|
||||
#
|
||||
# @missing: 0000..10FFFF; <none>
|
||||
|
||||
0028; 0029 # LEFT PARENTHESIS
|
||||
0029; 0028 # RIGHT PARENTHESIS
|
||||
003C; 003E # LESS-THAN SIGN
|
||||
003E; 003C # GREATER-THAN SIGN
|
||||
005B; 005D # LEFT SQUARE BRACKET
|
||||
005D; 005B # RIGHT SQUARE BRACKET
|
||||
007B; 007D # LEFT CURLY BRACKET
|
||||
007D; 007B # RIGHT CURLY BRACKET
|
||||
00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
|
||||
0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
|
||||
0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
|
||||
0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
|
||||
169B; 169C # OGHAM FEATHER MARK
|
||||
169C; 169B # OGHAM REVERSED FEATHER MARK
|
||||
2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
|
||||
2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
|
||||
207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
|
||||
207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
|
||||
208D; 208E # SUBSCRIPT LEFT PARENTHESIS
|
||||
208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
|
||||
2208; 220B # ELEMENT OF
|
||||
2209; 220C # [BEST FIT] NOT AN ELEMENT OF
|
||||
220A; 220D # SMALL ELEMENT OF
|
||||
220B; 2208 # CONTAINS AS MEMBER
|
||||
220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
|
||||
220D; 220A # SMALL CONTAINS AS MEMBER
|
||||
2215; 29F5 # DIVISION SLASH
|
||||
221F; 2BFE # RIGHT ANGLE
|
||||
2220; 29A3 # ANGLE
|
||||
2221; 299B # MEASURED ANGLE
|
||||
2222; 29A0 # SPHERICAL ANGLE
|
||||
2224; 2AEE # DOES NOT DIVIDE
|
||||
223C; 223D # TILDE OPERATOR
|
||||
223D; 223C # REVERSED TILDE
|
||||
2243; 22CD # ASYMPTOTICALLY EQUAL TO
|
||||
2245; 224C # APPROXIMATELY EQUAL TO
|
||||
224C; 2245 # ALL EQUAL TO
|
||||
2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
|
||||
2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
|
||||
2254; 2255 # COLON EQUALS
|
||||
2255; 2254 # EQUALS COLON
|
||||
2264; 2265 # LESS-THAN OR EQUAL TO
|
||||
2265; 2264 # GREATER-THAN OR EQUAL TO
|
||||
2266; 2267 # LESS-THAN OVER EQUAL TO
|
||||
2267; 2266 # GREATER-THAN OVER EQUAL TO
|
||||
2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
|
||||
2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
|
||||
226A; 226B # MUCH LESS-THAN
|
||||
226B; 226A # MUCH GREATER-THAN
|
||||
226E; 226F # [BEST FIT] NOT LESS-THAN
|
||||
226F; 226E # [BEST FIT] NOT GREATER-THAN
|
||||
2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
|
||||
2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
|
||||
2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
|
||||
2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
|
||||
2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
|
||||
2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
|
||||
2276; 2277 # LESS-THAN OR GREATER-THAN
|
||||
2277; 2276 # GREATER-THAN OR LESS-THAN
|
||||
2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
|
||||
2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
|
||||
227A; 227B # PRECEDES
|
||||
227B; 227A # SUCCEEDS
|
||||
227C; 227D # PRECEDES OR EQUAL TO
|
||||
227D; 227C # SUCCEEDS OR EQUAL TO
|
||||
227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
|
||||
227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
|
||||
2280; 2281 # [BEST FIT] DOES NOT PRECEDE
|
||||
2281; 2280 # [BEST FIT] DOES NOT SUCCEED
|
||||
2282; 2283 # SUBSET OF
|
||||
2283; 2282 # SUPERSET OF
|
||||
2284; 2285 # [BEST FIT] NOT A SUBSET OF
|
||||
2285; 2284 # [BEST FIT] NOT A SUPERSET OF
|
||||
2286; 2287 # SUBSET OF OR EQUAL TO
|
||||
2287; 2286 # SUPERSET OF OR EQUAL TO
|
||||
2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
|
||||
2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
|
||||
228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
|
||||
228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
|
||||
228F; 2290 # SQUARE IMAGE OF
|
||||
2290; 228F # SQUARE ORIGINAL OF
|
||||
2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
|
||||
2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
|
||||
2298; 29B8 # CIRCLED DIVISION SLASH
|
||||
22A2; 22A3 # RIGHT TACK
|
||||
22A3; 22A2 # LEFT TACK
|
||||
22A6; 2ADE # ASSERTION
|
||||
22A8; 2AE4 # TRUE
|
||||
22A9; 2AE3 # FORCES
|
||||
22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
22B0; 22B1 # PRECEDES UNDER RELATION
|
||||
22B1; 22B0 # SUCCEEDS UNDER RELATION
|
||||
22B2; 22B3 # NORMAL SUBGROUP OF
|
||||
22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
|
||||
22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
|
||||
22B6; 22B7 # ORIGINAL OF
|
||||
22B7; 22B6 # IMAGE OF
|
||||
22B8; 27DC # MULTIMAP
|
||||
22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CB; 22CC # LEFT SEMIDIRECT PRODUCT
|
||||
22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
|
||||
22CD; 2243 # REVERSED TILDE EQUALS
|
||||
22D0; 22D1 # DOUBLE SUBSET
|
||||
22D1; 22D0 # DOUBLE SUPERSET
|
||||
22D6; 22D7 # LESS-THAN WITH DOT
|
||||
22D7; 22D6 # GREATER-THAN WITH DOT
|
||||
22D8; 22D9 # VERY MUCH LESS-THAN
|
||||
22D9; 22D8 # VERY MUCH GREATER-THAN
|
||||
22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
|
||||
22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
|
||||
22DC; 22DD # EQUAL TO OR LESS-THAN
|
||||
22DD; 22DC # EQUAL TO OR GREATER-THAN
|
||||
22DE; 22DF # EQUAL TO OR PRECEDES
|
||||
22DF; 22DE # EQUAL TO OR SUCCEEDS
|
||||
22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
|
||||
22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
|
||||
22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
|
||||
22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
|
||||
22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
|
||||
22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
|
||||
22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
|
||||
22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
|
||||
22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
|
||||
22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
|
||||
22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
|
||||
22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
|
||||
22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
|
||||
22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
|
||||
22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
|
||||
22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
|
||||
22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F6; 22FD # ELEMENT OF WITH OVERBAR
|
||||
22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
|
||||
22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
|
||||
22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FD; 22F6 # CONTAINS WITH OVERBAR
|
||||
22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
|
||||
2308; 2309 # LEFT CEILING
|
||||
2309; 2308 # RIGHT CEILING
|
||||
230A; 230B # LEFT FLOOR
|
||||
230B; 230A # RIGHT FLOOR
|
||||
2329; 232A # LEFT-POINTING ANGLE BRACKET
|
||||
232A; 2329 # RIGHT-POINTING ANGLE BRACKET
|
||||
2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
|
||||
2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
|
||||
276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
|
||||
276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
|
||||
276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
|
||||
2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
|
||||
27C3; 27C4 # OPEN SUBSET
|
||||
27C4; 27C3 # OPEN SUPERSET
|
||||
27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
|
||||
27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
|
||||
27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
|
||||
27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
|
||||
27CB; 27CD # MATHEMATICAL RISING DIAGONAL
|
||||
27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
|
||||
27D5; 27D6 # LEFT OUTER JOIN
|
||||
27D6; 27D5 # RIGHT OUTER JOIN
|
||||
27DC; 22B8 # LEFT MULTIMAP
|
||||
27DD; 27DE # LONG RIGHT TACK
|
||||
27DE; 27DD # LONG LEFT TACK
|
||||
27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
|
||||
27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
|
||||
27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
|
||||
27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
|
||||
27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
|
||||
27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
|
||||
27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
|
||||
27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
|
||||
27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
|
||||
27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
|
||||
27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
|
||||
27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
|
||||
27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
|
||||
2983; 2984 # LEFT WHITE CURLY BRACKET
|
||||
2984; 2983 # RIGHT WHITE CURLY BRACKET
|
||||
2985; 2986 # LEFT WHITE PARENTHESIS
|
||||
2986; 2985 # RIGHT WHITE PARENTHESIS
|
||||
2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
|
||||
2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
|
||||
2989; 298A # Z NOTATION LEFT BINDING BRACKET
|
||||
298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
|
||||
298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
|
||||
298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
|
||||
298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
2991; 2992 # LEFT ANGLE BRACKET WITH DOT
|
||||
2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
|
||||
2993; 2994 # LEFT ARC LESS-THAN BRACKET
|
||||
2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
|
||||
2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
|
||||
2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
|
||||
2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
|
||||
2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
|
||||
299B; 2221 # MEASURED ANGLE OPENING LEFT
|
||||
29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
|
||||
29A3; 2220 # REVERSED ANGLE
|
||||
29A4; 29A5 # ANGLE WITH UNDERBAR
|
||||
29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
|
||||
29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
|
||||
29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
|
||||
29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
|
||||
29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
|
||||
29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
|
||||
29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
|
||||
29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
|
||||
29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
|
||||
29B8; 2298 # CIRCLED REVERSE SOLIDUS
|
||||
29C0; 29C1 # CIRCLED LESS-THAN
|
||||
29C1; 29C0 # CIRCLED GREATER-THAN
|
||||
29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
|
||||
29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
|
||||
29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
|
||||
29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
|
||||
29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
|
||||
29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
|
||||
29D4; 29D5 # TIMES WITH LEFT HALF BLACK
|
||||
29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
|
||||
29D8; 29D9 # LEFT WIGGLY FENCE
|
||||
29D9; 29D8 # RIGHT WIGGLY FENCE
|
||||
29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
|
||||
29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
|
||||
29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
|
||||
29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
|
||||
29F5; 2215 # REVERSE SOLIDUS OPERATOR
|
||||
29F8; 29F9 # BIG SOLIDUS
|
||||
29F9; 29F8 # BIG REVERSE SOLIDUS
|
||||
29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
|
||||
29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
|
||||
2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
|
||||
2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
|
||||
2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
|
||||
2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
|
||||
2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
|
||||
2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
|
||||
2A3C; 2A3D # INTERIOR PRODUCT
|
||||
2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
|
||||
2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
|
||||
2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
|
||||
2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
|
||||
2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
|
||||
2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
|
||||
2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
|
||||
2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
|
||||
2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
|
||||
2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
|
||||
2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
|
||||
2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
|
||||
2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
|
||||
2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
|
||||
2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
|
||||
2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
|
||||
2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
|
||||
2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
|
||||
2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
|
||||
2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
|
||||
2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
|
||||
2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
|
||||
2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
|
||||
2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
|
||||
2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
|
||||
2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
|
||||
2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
|
||||
2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
|
||||
2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
|
||||
2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
|
||||
2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
|
||||
2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
|
||||
2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
|
||||
2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
|
||||
2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
|
||||
2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AAA; 2AAB # SMALLER THAN
|
||||
2AAB; 2AAA # LARGER THAN
|
||||
2AAC; 2AAD # SMALLER THAN OR EQUAL TO
|
||||
2AAD; 2AAC # LARGER THAN OR EQUAL TO
|
||||
2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
|
||||
2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
|
||||
2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
|
||||
2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
|
||||
2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
|
||||
2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
|
||||
2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
|
||||
2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
|
||||
2ABB; 2ABC # DOUBLE PRECEDES
|
||||
2ABC; 2ABB # DOUBLE SUCCEEDS
|
||||
2ABD; 2ABE # SUBSET WITH DOT
|
||||
2ABE; 2ABD # SUPERSET WITH DOT
|
||||
2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
|
||||
2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
|
||||
2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
|
||||
2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
|
||||
2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
|
||||
2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
|
||||
2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
|
||||
2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
|
||||
2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
|
||||
2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
|
||||
2ACF; 2AD0 # CLOSED SUBSET
|
||||
2AD0; 2ACF # CLOSED SUPERSET
|
||||
2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
|
||||
2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
|
||||
2AD3; 2AD4 # SUBSET ABOVE SUPERSET
|
||||
2AD4; 2AD3 # SUPERSET ABOVE SUBSET
|
||||
2AD5; 2AD6 # SUBSET ABOVE SUBSET
|
||||
2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
|
||||
2ADE; 22A6 # SHORT LEFT TACK
|
||||
2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
|
||||
2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AEC; 2AED # DOUBLE STROKE NOT SIGN
|
||||
2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
|
||||
2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
|
||||
2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
|
||||
2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
|
||||
2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
|
||||
2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
|
||||
2BFE; 221F # REVERSED RIGHT ANGLE
|
||||
2E02; 2E03 # LEFT SUBSTITUTION BRACKET
|
||||
2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
|
||||
2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
|
||||
2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
|
||||
2E09; 2E0A # LEFT TRANSPOSITION BRACKET
|
||||
2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
|
||||
2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
|
||||
2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
|
||||
2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
|
||||
2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
|
||||
2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
|
||||
2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
|
||||
2E22; 2E23 # TOP LEFT HALF BRACKET
|
||||
2E23; 2E22 # TOP RIGHT HALF BRACKET
|
||||
2E24; 2E25 # BOTTOM LEFT HALF BRACKET
|
||||
2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
|
||||
2E26; 2E27 # LEFT SIDEWAYS U BRACKET
|
||||
2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
|
||||
2E28; 2E29 # LEFT DOUBLE PARENTHESIS
|
||||
2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
|
||||
2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
|
||||
2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59; 2E5A # TOP HALF LEFT PARENTHESIS
|
||||
2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
|
||||
2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
|
||||
2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
|
||||
3008; 3009 # LEFT ANGLE BRACKET
|
||||
3009; 3008 # RIGHT ANGLE BRACKET
|
||||
300A; 300B # LEFT DOUBLE ANGLE BRACKET
|
||||
300B; 300A # RIGHT DOUBLE ANGLE BRACKET
|
||||
300C; 300D # [BEST FIT] LEFT CORNER BRACKET
|
||||
300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
|
||||
300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
|
||||
300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
|
||||
3010; 3011 # LEFT BLACK LENTICULAR BRACKET
|
||||
3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
|
||||
3014; 3015 # LEFT TORTOISE SHELL BRACKET
|
||||
3015; 3014 # RIGHT TORTOISE SHELL BRACKET
|
||||
3016; 3017 # LEFT WHITE LENTICULAR BRACKET
|
||||
3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
|
||||
3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
|
||||
3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
301A; 301B # LEFT WHITE SQUARE BRACKET
|
||||
301B; 301A # RIGHT WHITE SQUARE BRACKET
|
||||
FE59; FE5A # SMALL LEFT PARENTHESIS
|
||||
FE5A; FE59 # SMALL RIGHT PARENTHESIS
|
||||
FE5B; FE5C # SMALL LEFT CURLY BRACKET
|
||||
FE5C; FE5B # SMALL RIGHT CURLY BRACKET
|
||||
FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
|
||||
FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
|
||||
FE64; FE65 # SMALL LESS-THAN SIGN
|
||||
FE65; FE64 # SMALL GREATER-THAN SIGN
|
||||
FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
|
||||
FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
|
||||
FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
|
||||
FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
|
||||
FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
|
||||
FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
|
||||
FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
|
||||
FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
|
||||
FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
|
||||
FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
|
||||
FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
||||
# The following characters have no appropriate mirroring character.
|
||||
# For these characters it is up to the rendering system
|
||||
# to provide mirrored glyphs.
|
||||
|
||||
# 2140; DOUBLE-STRUCK N-ARY SUMMATION
|
||||
# 2201; COMPLEMENT
|
||||
# 2202; PARTIAL DIFFERENTIAL
|
||||
# 2203; THERE EXISTS
|
||||
# 2204; THERE DOES NOT EXIST
|
||||
# 2211; N-ARY SUMMATION
|
||||
# 2216; SET MINUS
|
||||
# 221A; SQUARE ROOT
|
||||
# 221B; CUBE ROOT
|
||||
# 221C; FOURTH ROOT
|
||||
# 221D; PROPORTIONAL TO
|
||||
# 2226; NOT PARALLEL TO
|
||||
# 222B; INTEGRAL
|
||||
# 222C; DOUBLE INTEGRAL
|
||||
# 222D; TRIPLE INTEGRAL
|
||||
# 222E; CONTOUR INTEGRAL
|
||||
# 222F; SURFACE INTEGRAL
|
||||
# 2230; VOLUME INTEGRAL
|
||||
# 2231; CLOCKWISE INTEGRAL
|
||||
# 2232; CLOCKWISE CONTOUR INTEGRAL
|
||||
# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
|
||||
# 2239; EXCESS
|
||||
# 223B; HOMOTHETIC
|
||||
# 223E; INVERTED LAZY S
|
||||
# 223F; SINE WAVE
|
||||
# 2240; WREATH PRODUCT
|
||||
# 2241; NOT TILDE
|
||||
# 2242; MINUS TILDE
|
||||
# 2244; NOT ASYMPTOTICALLY EQUAL TO
|
||||
# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
|
||||
# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
|
||||
# 2248; ALMOST EQUAL TO
|
||||
# 2249; NOT ALMOST EQUAL TO
|
||||
# 224A; ALMOST EQUAL OR EQUAL TO
|
||||
# 224B; TRIPLE TILDE
|
||||
# 225F; QUESTIONED EQUAL TO
|
||||
# 2260; NOT EQUAL TO
|
||||
# 2262; NOT IDENTICAL TO
|
||||
# 228C; MULTISET
|
||||
# 22A7; MODELS
|
||||
# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
|
||||
# 22AC; DOES NOT PROVE
|
||||
# 22AD; NOT TRUE
|
||||
# 22AE; DOES NOT FORCE
|
||||
# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
# 22BE; RIGHT ANGLE WITH ARC
|
||||
# 22BF; RIGHT TRIANGLE
|
||||
# 22F5; ELEMENT OF WITH DOT ABOVE
|
||||
# 22F8; ELEMENT OF WITH UNDERBAR
|
||||
# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
|
||||
# 22FF; Z NOTATION BAG MEMBERSHIP
|
||||
# 2320; TOP HALF INTEGRAL
|
||||
# 2321; BOTTOM HALF INTEGRAL
|
||||
# 27C0; THREE DIMENSIONAL ANGLE
|
||||
# 27CC; LONG DIVISION
|
||||
# 27D3; LOWER RIGHT CORNER WITH DOT
|
||||
# 27D4; UPPER LEFT CORNER WITH DOT
|
||||
# 299C; RIGHT ANGLE VARIANT WITH SQUARE
|
||||
# 299D; MEASURED RIGHT ANGLE WITH DOT
|
||||
# 299E; ANGLE WITH S INSIDE
|
||||
# 299F; ACUTE ANGLE
|
||||
# 29A2; TURNED ANGLE
|
||||
# 29A6; OBLIQUE ANGLE OPENING UP
|
||||
# 29A7; OBLIQUE ANGLE OPENING DOWN
|
||||
# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
|
||||
# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
|
||||
# 29C9; TWO JOINED SQUARES
|
||||
# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
|
||||
# 29DC; INCOMPLETE INFINITY
|
||||
# 29E1; INCREASES AS
|
||||
# 29E3; EQUALS SIGN AND SLANTED PARALLEL
|
||||
# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
|
||||
# 29E5; IDENTICAL TO AND SLANTED PARALLEL
|
||||
# 29F4; RULE-DELAYED
|
||||
# 29F6; SOLIDUS WITH OVERBAR
|
||||
# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
|
||||
# 2A0A; MODULO TWO SUM
|
||||
# 2A0B; SUMMATION WITH INTEGRAL
|
||||
# 2A0C; QUADRUPLE INTEGRAL OPERATOR
|
||||
# 2A0D; FINITE PART INTEGRAL
|
||||
# 2A0E; INTEGRAL WITH DOUBLE STROKE
|
||||
# 2A0F; INTEGRAL AVERAGE WITH SLASH
|
||||
# 2A10; CIRCULATION FUNCTION
|
||||
# 2A11; ANTICLOCKWISE INTEGRATION
|
||||
# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
|
||||
# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
|
||||
# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
|
||||
# 2A15; INTEGRAL AROUND A POINT OPERATOR
|
||||
# 2A16; QUATERNION INTEGRAL OPERATOR
|
||||
# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
|
||||
# 2A18; INTEGRAL WITH TIMES SIGN
|
||||
# 2A19; INTEGRAL WITH INTERSECTION
|
||||
# 2A1A; INTEGRAL WITH UNION
|
||||
# 2A1B; INTEGRAL WITH OVERBAR
|
||||
# 2A1C; INTEGRAL WITH UNDERBAR
|
||||
# 2A1E; LARGE LEFT TRIANGLE OPERATOR
|
||||
# 2A1F; Z NOTATION SCHEMA COMPOSITION
|
||||
# 2A20; Z NOTATION SCHEMA PIPING
|
||||
# 2A21; Z NOTATION SCHEMA PROJECTION
|
||||
# 2A24; PLUS SIGN WITH TILDE ABOVE
|
||||
# 2A26; PLUS SIGN WITH TILDE BELOW
|
||||
# 2A29; MINUS SIGN WITH COMMA ABOVE
|
||||
# 2A3E; Z NOTATION RELATIONAL COMPOSITION
|
||||
# 2A57; SLOPING LARGE OR
|
||||
# 2A58; SLOPING LARGE AND
|
||||
# 2A6A; TILDE OPERATOR WITH DOT ABOVE
|
||||
# 2A6B; TILDE OPERATOR WITH RISING DOTS
|
||||
# 2A6C; SIMILAR MINUS SIMILAR
|
||||
# 2A6D; CONGRUENT WITH DOT ABOVE
|
||||
# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
|
||||
# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
|
||||
# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
|
||||
# 2A74; DOUBLE COLON EQUAL
|
||||
# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
|
||||
# 2ADC; FORKING
|
||||
# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
|
||||
# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
|
||||
# 2AF3; PARALLEL WITH TILDE OPERATOR
|
||||
# 2AFB; TRIPLE SOLIDUS BINARY RELATION
|
||||
# 2AFD; DOUBLE SOLIDUS OPERATOR
|
||||
# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
|
||||
# EOF
|
|
@ -1,6 +1,6 @@
|
|||
# CaseFolding-13.0.0.txt
|
||||
# Date: 2019-09-08, 23:30:59 GMT
|
||||
# © 2019 Unicode®, Inc.
|
||||
# CaseFolding-14.0.0.txt
|
||||
# Date: 2021-03-08, 19:35:41 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -1050,6 +1050,7 @@
|
|||
2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC
|
||||
2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A
|
||||
2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
|
||||
2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
|
||||
2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR
|
||||
2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE
|
||||
2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE
|
||||
|
@ -1230,12 +1231,16 @@ A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE
|
|||
A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A
|
||||
A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I
|
||||
A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U
|
||||
A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O
|
||||
A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W
|
||||
A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK
|
||||
A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK
|
||||
A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK
|
||||
A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
|
||||
A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
|
||||
A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
|
||||
A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
|
||||
A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H
|
||||
AB70; C; 13A0; # CHEROKEE SMALL LETTER A
|
||||
AB71; C; 13A1; # CHEROKEE SMALL LETTER E
|
||||
|
@ -1431,6 +1436,41 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
|
|||
104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA
|
||||
104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA
|
||||
104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA
|
||||
10570; C; 10597; # VITHKUQI CAPITAL LETTER A
|
||||
10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE
|
||||
10572; C; 10599; # VITHKUQI CAPITAL LETTER BE
|
||||
10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE
|
||||
10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE
|
||||
10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE
|
||||
10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE
|
||||
10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI
|
||||
10578; C; 1059F; # VITHKUQI CAPITAL LETTER E
|
||||
10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE
|
||||
1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA
|
||||
1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA
|
||||
1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA
|
||||
1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I
|
||||
1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE
|
||||
10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE
|
||||
10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA
|
||||
10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA
|
||||
10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA
|
||||
10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME
|
||||
10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE
|
||||
10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE
|
||||
10587; C; 105AE; # VITHKUQI CAPITAL LETTER O
|
||||
10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE
|
||||
10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA
|
||||
1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE
|
||||
1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE
|
||||
1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE
|
||||
1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE
|
||||
1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE
|
||||
10590; C; 105B7; # VITHKUQI CAPITAL LETTER U
|
||||
10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE
|
||||
10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE
|
||||
10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y
|
||||
10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE
|
||||
10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A
|
||||
10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA
|
||||
10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,6 @@
|
|||
# DerivedGeneralCategory-13.0.0.txt
|
||||
# Date: 2019-10-21, 14:30:32 GMT
|
||||
# © 2019 Unicode®, Inc.
|
||||
# DerivedGeneralCategory-14.0.0.txt
|
||||
# Date: 2021-07-10, 00:35:08 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -27,7 +27,6 @@
|
|||
05C8..05CF ; Cn # [8] <reserved-05C8>..<reserved-05CF>
|
||||
05EB..05EE ; Cn # [4] <reserved-05EB>..<reserved-05EE>
|
||||
05F5..05FF ; Cn # [11] <reserved-05F5>..<reserved-05FF>
|
||||
061D ; Cn # <reserved-061D>
|
||||
070E ; Cn # <reserved-070E>
|
||||
074B..074C ; Cn # [2] <reserved-074B>..<reserved-074C>
|
||||
07B2..07BF ; Cn # [14] <reserved-07B2>..<reserved-07BF>
|
||||
|
@ -36,9 +35,9 @@
|
|||
083F ; Cn # <reserved-083F>
|
||||
085C..085D ; Cn # [2] <reserved-085C>..<reserved-085D>
|
||||
085F ; Cn # <reserved-085F>
|
||||
086B..089F ; Cn # [53] <reserved-086B>..<reserved-089F>
|
||||
08B5 ; Cn # <reserved-08B5>
|
||||
08C8..08D2 ; Cn # [11] <reserved-08C8>..<reserved-08D2>
|
||||
086B..086F ; Cn # [5] <reserved-086B>..<reserved-086F>
|
||||
088F ; Cn # <reserved-088F>
|
||||
0892..0897 ; Cn # [6] <reserved-0892>..<reserved-0897>
|
||||
0984 ; Cn # <reserved-0984>
|
||||
098D..098E ; Cn # [2] <reserved-098D>..<reserved-098E>
|
||||
0991..0992 ; Cn # [2] <reserved-0991>..<reserved-0992>
|
||||
|
@ -116,12 +115,13 @@
|
|||
0C0D ; Cn # <reserved-0C0D>
|
||||
0C11 ; Cn # <reserved-0C11>
|
||||
0C29 ; Cn # <reserved-0C29>
|
||||
0C3A..0C3C ; Cn # [3] <reserved-0C3A>..<reserved-0C3C>
|
||||
0C3A..0C3B ; Cn # [2] <reserved-0C3A>..<reserved-0C3B>
|
||||
0C45 ; Cn # <reserved-0C45>
|
||||
0C49 ; Cn # <reserved-0C49>
|
||||
0C4E..0C54 ; Cn # [7] <reserved-0C4E>..<reserved-0C54>
|
||||
0C57 ; Cn # <reserved-0C57>
|
||||
0C5B..0C5F ; Cn # [5] <reserved-0C5B>..<reserved-0C5F>
|
||||
0C5B..0C5C ; Cn # [2] <reserved-0C5B>..<reserved-0C5C>
|
||||
0C5E..0C5F ; Cn # [2] <reserved-0C5E>..<reserved-0C5F>
|
||||
0C64..0C65 ; Cn # [2] <reserved-0C64>..<reserved-0C65>
|
||||
0C70..0C76 ; Cn # [7] <reserved-0C70>..<reserved-0C76>
|
||||
0C8D ; Cn # <reserved-0C8D>
|
||||
|
@ -132,7 +132,7 @@
|
|||
0CC5 ; Cn # <reserved-0CC5>
|
||||
0CC9 ; Cn # <reserved-0CC9>
|
||||
0CCE..0CD4 ; Cn # [7] <reserved-0CCE>..<reserved-0CD4>
|
||||
0CD7..0CDD ; Cn # [7] <reserved-0CD7>..<reserved-0CDD>
|
||||
0CD7..0CDC ; Cn # [6] <reserved-0CD7>..<reserved-0CDC>
|
||||
0CDF ; Cn # <reserved-0CDF>
|
||||
0CE4..0CE5 ; Cn # [2] <reserved-0CE4>..<reserved-0CE5>
|
||||
0CF0 ; Cn # <reserved-0CF0>
|
||||
|
@ -200,8 +200,7 @@
|
|||
13FE..13FF ; Cn # [2] <reserved-13FE>..<reserved-13FF>
|
||||
169D..169F ; Cn # [3] <reserved-169D>..<reserved-169F>
|
||||
16F9..16FF ; Cn # [7] <reserved-16F9>..<reserved-16FF>
|
||||
170D ; Cn # <reserved-170D>
|
||||
1715..171F ; Cn # [11] <reserved-1715>..<reserved-171F>
|
||||
1716..171E ; Cn # [9] <reserved-1716>..<reserved-171E>
|
||||
1737..173F ; Cn # [9] <reserved-1737>..<reserved-173F>
|
||||
1754..175F ; Cn # [12] <reserved-1754>..<reserved-175F>
|
||||
176D ; Cn # <reserved-176D>
|
||||
|
@ -210,7 +209,6 @@
|
|||
17DE..17DF ; Cn # [2] <reserved-17DE>..<reserved-17DF>
|
||||
17EA..17EF ; Cn # [6] <reserved-17EA>..<reserved-17EF>
|
||||
17FA..17FF ; Cn # [6] <reserved-17FA>..<reserved-17FF>
|
||||
180F ; Cn # <reserved-180F>
|
||||
181A..181F ; Cn # [6] <reserved-181A>..<reserved-181F>
|
||||
1879..187F ; Cn # [7] <reserved-1879>..<reserved-187F>
|
||||
18AB..18AF ; Cn # [5] <reserved-18AB>..<reserved-18AF>
|
||||
|
@ -230,9 +228,9 @@
|
|||
1A8A..1A8F ; Cn # [6] <reserved-1A8A>..<reserved-1A8F>
|
||||
1A9A..1A9F ; Cn # [6] <reserved-1A9A>..<reserved-1A9F>
|
||||
1AAE..1AAF ; Cn # [2] <reserved-1AAE>..<reserved-1AAF>
|
||||
1AC1..1AFF ; Cn # [63] <reserved-1AC1>..<reserved-1AFF>
|
||||
1B4C..1B4F ; Cn # [4] <reserved-1B4C>..<reserved-1B4F>
|
||||
1B7D..1B7F ; Cn # [3] <reserved-1B7D>..<reserved-1B7F>
|
||||
1ACF..1AFF ; Cn # [49] <reserved-1ACF>..<reserved-1AFF>
|
||||
1B4D..1B4F ; Cn # [3] <reserved-1B4D>..<reserved-1B4F>
|
||||
1B7F ; Cn # <reserved-1B7F>
|
||||
1BF4..1BFB ; Cn # [8] <reserved-1BF4>..<reserved-1BFB>
|
||||
1C38..1C3A ; Cn # [3] <reserved-1C38>..<reserved-1C3A>
|
||||
1C4A..1C4C ; Cn # [3] <reserved-1C4A>..<reserved-1C4C>
|
||||
|
@ -240,7 +238,6 @@
|
|||
1CBB..1CBC ; Cn # [2] <reserved-1CBB>..<reserved-1CBC>
|
||||
1CC8..1CCF ; Cn # [8] <reserved-1CC8>..<reserved-1CCF>
|
||||
1CFB..1CFF ; Cn # [5] <reserved-1CFB>..<reserved-1CFF>
|
||||
1DFA ; Cn # <reserved-1DFA>
|
||||
1F16..1F17 ; Cn # [2] <reserved-1F16>..<reserved-1F17>
|
||||
1F1E..1F1F ; Cn # [2] <reserved-1F1E>..<reserved-1F1F>
|
||||
1F46..1F47 ; Cn # [2] <reserved-1F46>..<reserved-1F47>
|
||||
|
@ -261,15 +258,13 @@
|
|||
2072..2073 ; Cn # [2] <reserved-2072>..<reserved-2073>
|
||||
208F ; Cn # <reserved-208F>
|
||||
209D..209F ; Cn # [3] <reserved-209D>..<reserved-209F>
|
||||
20C0..20CF ; Cn # [16] <reserved-20C0>..<reserved-20CF>
|
||||
20C1..20CF ; Cn # [15] <reserved-20C1>..<reserved-20CF>
|
||||
20F1..20FF ; Cn # [15] <reserved-20F1>..<reserved-20FF>
|
||||
218C..218F ; Cn # [4] <reserved-218C>..<reserved-218F>
|
||||
2427..243F ; Cn # [25] <reserved-2427>..<reserved-243F>
|
||||
244B..245F ; Cn # [21] <reserved-244B>..<reserved-245F>
|
||||
2B74..2B75 ; Cn # [2] <reserved-2B74>..<reserved-2B75>
|
||||
2B96 ; Cn # <reserved-2B96>
|
||||
2C2F ; Cn # <reserved-2C2F>
|
||||
2C5F ; Cn # <reserved-2C5F>
|
||||
2CF4..2CF8 ; Cn # [5] <reserved-2CF4>..<reserved-2CF8>
|
||||
2D26 ; Cn # <reserved-2D26>
|
||||
2D28..2D2C ; Cn # [5] <reserved-2D28>..<reserved-2D2C>
|
||||
|
@ -285,7 +280,7 @@
|
|||
2DCF ; Cn # <reserved-2DCF>
|
||||
2DD7 ; Cn # <reserved-2DD7>
|
||||
2DDF ; Cn # <reserved-2DDF>
|
||||
2E53..2E7F ; Cn # [45] <reserved-2E53>..<reserved-2E7F>
|
||||
2E5E..2E7F ; Cn # [34] <reserved-2E5E>..<reserved-2E7F>
|
||||
2E9A ; Cn # <reserved-2E9A>
|
||||
2EF4..2EFF ; Cn # [12] <reserved-2EF4>..<reserved-2EFF>
|
||||
2FD6..2FEF ; Cn # [26] <reserved-2FD6>..<reserved-2FEF>
|
||||
|
@ -297,13 +292,14 @@
|
|||
318F ; Cn # <reserved-318F>
|
||||
31E4..31EF ; Cn # [12] <reserved-31E4>..<reserved-31EF>
|
||||
321F ; Cn # <reserved-321F>
|
||||
9FFD..9FFF ; Cn # [3] <reserved-9FFD>..<reserved-9FFF>
|
||||
A48D..A48F ; Cn # [3] <reserved-A48D>..<reserved-A48F>
|
||||
A4C7..A4CF ; Cn # [9] <reserved-A4C7>..<reserved-A4CF>
|
||||
A62C..A63F ; Cn # [20] <reserved-A62C>..<reserved-A63F>
|
||||
A6F8..A6FF ; Cn # [8] <reserved-A6F8>..<reserved-A6FF>
|
||||
A7C0..A7C1 ; Cn # [2] <reserved-A7C0>..<reserved-A7C1>
|
||||
A7CB..A7F4 ; Cn # [42] <reserved-A7CB>..<reserved-A7F4>
|
||||
A7CB..A7CF ; Cn # [5] <reserved-A7CB>..<reserved-A7CF>
|
||||
A7D2 ; Cn # <reserved-A7D2>
|
||||
A7D4 ; Cn # <reserved-A7D4>
|
||||
A7DA..A7F1 ; Cn # [24] <reserved-A7DA>..<reserved-A7F1>
|
||||
A82D..A82F ; Cn # [3] <reserved-A82D>..<reserved-A82F>
|
||||
A83A..A83F ; Cn # [6] <reserved-A83A>..<reserved-A83F>
|
||||
A878..A87F ; Cn # [8] <reserved-A878>..<reserved-A87F>
|
||||
|
@ -339,11 +335,10 @@ FB3D ; Cn # <reserved-FB3D>
|
|||
FB3F ; Cn # <reserved-FB3F>
|
||||
FB42 ; Cn # <reserved-FB42>
|
||||
FB45 ; Cn # <reserved-FB45>
|
||||
FBC2..FBD2 ; Cn # [17] <reserved-FBC2>..<reserved-FBD2>
|
||||
FD40..FD4F ; Cn # [16] <reserved-FD40>..<reserved-FD4F>
|
||||
FBC3..FBD2 ; Cn # [16] <reserved-FBC3>..<reserved-FBD2>
|
||||
FD90..FD91 ; Cn # [2] <reserved-FD90>..<reserved-FD91>
|
||||
FDC8..FDEF ; Cn # [40] <reserved-FDC8>..<noncharacter-FDEF>
|
||||
FDFE..FDFF ; Cn # [2] <reserved-FDFE>..<reserved-FDFF>
|
||||
FDC8..FDCE ; Cn # [7] <reserved-FDC8>..<reserved-FDCE>
|
||||
FDD0..FDEF ; Cn # [32] <noncharacter-FDD0>..<noncharacter-FDEF>
|
||||
FE1A..FE1F ; Cn # [6] <reserved-FE1A>..<reserved-FE1F>
|
||||
FE53 ; Cn # <reserved-FE53>
|
||||
FE67 ; Cn # <reserved-FE67>
|
||||
|
@ -387,10 +382,20 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
104FC..104FF ; Cn # [4] <reserved-104FC>..<reserved-104FF>
|
||||
10528..1052F ; Cn # [8] <reserved-10528>..<reserved-1052F>
|
||||
10564..1056E ; Cn # [11] <reserved-10564>..<reserved-1056E>
|
||||
10570..105FF ; Cn # [144] <reserved-10570>..<reserved-105FF>
|
||||
1057B ; Cn # <reserved-1057B>
|
||||
1058B ; Cn # <reserved-1058B>
|
||||
10593 ; Cn # <reserved-10593>
|
||||
10596 ; Cn # <reserved-10596>
|
||||
105A2 ; Cn # <reserved-105A2>
|
||||
105B2 ; Cn # <reserved-105B2>
|
||||
105BA ; Cn # <reserved-105BA>
|
||||
105BD..105FF ; Cn # [67] <reserved-105BD>..<reserved-105FF>
|
||||
10737..1073F ; Cn # [9] <reserved-10737>..<reserved-1073F>
|
||||
10756..1075F ; Cn # [10] <reserved-10756>..<reserved-1075F>
|
||||
10768..107FF ; Cn # [152] <reserved-10768>..<reserved-107FF>
|
||||
10768..1077F ; Cn # [24] <reserved-10768>..<reserved-1077F>
|
||||
10786 ; Cn # <reserved-10786>
|
||||
107B1 ; Cn # <reserved-107B1>
|
||||
107BB..107FF ; Cn # [69] <reserved-107BB>..<reserved-107FF>
|
||||
10806..10807 ; Cn # [2] <reserved-10806>..<reserved-10807>
|
||||
10809 ; Cn # <reserved-10809>
|
||||
10836 ; Cn # <reserved-10836>
|
||||
|
@ -433,12 +438,13 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
10EAE..10EAF ; Cn # [2] <reserved-10EAE>..<reserved-10EAF>
|
||||
10EB2..10EFF ; Cn # [78] <reserved-10EB2>..<reserved-10EFF>
|
||||
10F28..10F2F ; Cn # [8] <reserved-10F28>..<reserved-10F2F>
|
||||
10F5A..10FAF ; Cn # [86] <reserved-10F5A>..<reserved-10FAF>
|
||||
10F5A..10F6F ; Cn # [22] <reserved-10F5A>..<reserved-10F6F>
|
||||
10F8A..10FAF ; Cn # [38] <reserved-10F8A>..<reserved-10FAF>
|
||||
10FCC..10FDF ; Cn # [20] <reserved-10FCC>..<reserved-10FDF>
|
||||
10FF7..10FFF ; Cn # [9] <reserved-10FF7>..<reserved-10FFF>
|
||||
1104E..11051 ; Cn # [4] <reserved-1104E>..<reserved-11051>
|
||||
11070..1107E ; Cn # [15] <reserved-11070>..<reserved-1107E>
|
||||
110C2..110CC ; Cn # [11] <reserved-110C2>..<reserved-110CC>
|
||||
11076..1107E ; Cn # [9] <reserved-11076>..<reserved-1107E>
|
||||
110C3..110CC ; Cn # [10] <reserved-110C3>..<reserved-110CC>
|
||||
110CE..110CF ; Cn # [2] <reserved-110CE>..<reserved-110CF>
|
||||
110E9..110EF ; Cn # [7] <reserved-110E9>..<reserved-110EF>
|
||||
110FA..110FF ; Cn # [6] <reserved-110FA>..<reserved-110FF>
|
||||
|
@ -480,11 +486,11 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
11645..1164F ; Cn # [11] <reserved-11645>..<reserved-1164F>
|
||||
1165A..1165F ; Cn # [6] <reserved-1165A>..<reserved-1165F>
|
||||
1166D..1167F ; Cn # [19] <reserved-1166D>..<reserved-1167F>
|
||||
116B9..116BF ; Cn # [7] <reserved-116B9>..<reserved-116BF>
|
||||
116BA..116BF ; Cn # [6] <reserved-116BA>..<reserved-116BF>
|
||||
116CA..116FF ; Cn # [54] <reserved-116CA>..<reserved-116FF>
|
||||
1171B..1171C ; Cn # [2] <reserved-1171B>..<reserved-1171C>
|
||||
1172C..1172F ; Cn # [4] <reserved-1172C>..<reserved-1172F>
|
||||
11740..117FF ; Cn # [192] <reserved-11740>..<reserved-117FF>
|
||||
11747..117FF ; Cn # [185] <reserved-11747>..<reserved-117FF>
|
||||
1183C..1189F ; Cn # [100] <reserved-1183C>..<reserved-1189F>
|
||||
118F3..118FE ; Cn # [12] <reserved-118F3>..<reserved-118FE>
|
||||
11907..11908 ; Cn # [2] <reserved-11907>..<reserved-11908>
|
||||
|
@ -499,7 +505,7 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
119D8..119D9 ; Cn # [2] <reserved-119D8>..<reserved-119D9>
|
||||
119E5..119FF ; Cn # [27] <reserved-119E5>..<reserved-119FF>
|
||||
11A48..11A4F ; Cn # [8] <reserved-11A48>..<reserved-11A4F>
|
||||
11AA3..11ABF ; Cn # [29] <reserved-11AA3>..<reserved-11ABF>
|
||||
11AA3..11AAF ; Cn # [13] <reserved-11AA3>..<reserved-11AAF>
|
||||
11AF9..11BFF ; Cn # [263] <reserved-11AF9>..<reserved-11BFF>
|
||||
11C09 ; Cn # <reserved-11C09>
|
||||
11C37 ; Cn # <reserved-11C37>
|
||||
|
@ -527,14 +533,16 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1239A..123FF ; Cn # [102] <reserved-1239A>..<reserved-123FF>
|
||||
1246F ; Cn # <reserved-1246F>
|
||||
12475..1247F ; Cn # [11] <reserved-12475>..<reserved-1247F>
|
||||
12544..12FFF ; Cn # [2748] <reserved-12544>..<reserved-12FFF>
|
||||
12544..12F8F ; Cn # [2636] <reserved-12544>..<reserved-12F8F>
|
||||
12FF3..12FFF ; Cn # [13] <reserved-12FF3>..<reserved-12FFF>
|
||||
1342F ; Cn # <reserved-1342F>
|
||||
13439..143FF ; Cn # [4039] <reserved-13439>..<reserved-143FF>
|
||||
14647..167FF ; Cn # [8633] <reserved-14647>..<reserved-167FF>
|
||||
16A39..16A3F ; Cn # [7] <reserved-16A39>..<reserved-16A3F>
|
||||
16A5F ; Cn # <reserved-16A5F>
|
||||
16A6A..16A6D ; Cn # [4] <reserved-16A6A>..<reserved-16A6D>
|
||||
16A70..16ACF ; Cn # [96] <reserved-16A70>..<reserved-16ACF>
|
||||
16ABF ; Cn # <reserved-16ABF>
|
||||
16ACA..16ACF ; Cn # [6] <reserved-16ACA>..<reserved-16ACF>
|
||||
16AEE..16AEF ; Cn # [2] <reserved-16AEE>..<reserved-16AEF>
|
||||
16AF6..16AFF ; Cn # [10] <reserved-16AF6>..<reserved-16AFF>
|
||||
16B46..16B4F ; Cn # [10] <reserved-16B46>..<reserved-16B4F>
|
||||
|
@ -550,8 +558,11 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
16FF2..16FFF ; Cn # [14] <reserved-16FF2>..<reserved-16FFF>
|
||||
187F8..187FF ; Cn # [8] <reserved-187F8>..<reserved-187FF>
|
||||
18CD6..18CFF ; Cn # [42] <reserved-18CD6>..<reserved-18CFF>
|
||||
18D09..1AFFF ; Cn # [8951] <reserved-18D09>..<reserved-1AFFF>
|
||||
1B11F..1B14F ; Cn # [49] <reserved-1B11F>..<reserved-1B14F>
|
||||
18D09..1AFEF ; Cn # [8935] <reserved-18D09>..<reserved-1AFEF>
|
||||
1AFF4 ; Cn # <reserved-1AFF4>
|
||||
1AFFC ; Cn # <reserved-1AFFC>
|
||||
1AFFF ; Cn # <reserved-1AFFF>
|
||||
1B123..1B14F ; Cn # [45] <reserved-1B123>..<reserved-1B14F>
|
||||
1B153..1B163 ; Cn # [17] <reserved-1B153>..<reserved-1B163>
|
||||
1B168..1B16F ; Cn # [8] <reserved-1B168>..<reserved-1B16F>
|
||||
1B2FC..1BBFF ; Cn # [2308] <reserved-1B2FC>..<reserved-1BBFF>
|
||||
|
@ -559,10 +570,13 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1BC7D..1BC7F ; Cn # [3] <reserved-1BC7D>..<reserved-1BC7F>
|
||||
1BC89..1BC8F ; Cn # [7] <reserved-1BC89>..<reserved-1BC8F>
|
||||
1BC9A..1BC9B ; Cn # [2] <reserved-1BC9A>..<reserved-1BC9B>
|
||||
1BCA4..1CFFF ; Cn # [4956] <reserved-1BCA4>..<reserved-1CFFF>
|
||||
1BCA4..1CEFF ; Cn # [4700] <reserved-1BCA4>..<reserved-1CEFF>
|
||||
1CF2E..1CF2F ; Cn # [2] <reserved-1CF2E>..<reserved-1CF2F>
|
||||
1CF47..1CF4F ; Cn # [9] <reserved-1CF47>..<reserved-1CF4F>
|
||||
1CFC4..1CFFF ; Cn # [60] <reserved-1CFC4>..<reserved-1CFFF>
|
||||
1D0F6..1D0FF ; Cn # [10] <reserved-1D0F6>..<reserved-1D0FF>
|
||||
1D127..1D128 ; Cn # [2] <reserved-1D127>..<reserved-1D128>
|
||||
1D1E9..1D1FF ; Cn # [23] <reserved-1D1E9>..<reserved-1D1FF>
|
||||
1D1EB..1D1FF ; Cn # [21] <reserved-1D1EB>..<reserved-1D1FF>
|
||||
1D246..1D2DF ; Cn # [154] <reserved-1D246>..<reserved-1D2DF>
|
||||
1D2F4..1D2FF ; Cn # [12] <reserved-1D2F4>..<reserved-1D2FF>
|
||||
1D357..1D35F ; Cn # [9] <reserved-1D357>..<reserved-1D35F>
|
||||
|
@ -589,7 +603,8 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1D7CC..1D7CD ; Cn # [2] <reserved-1D7CC>..<reserved-1D7CD>
|
||||
1DA8C..1DA9A ; Cn # [15] <reserved-1DA8C>..<reserved-1DA9A>
|
||||
1DAA0 ; Cn # <reserved-1DAA0>
|
||||
1DAB0..1DFFF ; Cn # [1360] <reserved-1DAB0>..<reserved-1DFFF>
|
||||
1DAB0..1DEFF ; Cn # [1104] <reserved-1DAB0>..<reserved-1DEFF>
|
||||
1DF1F..1DFFF ; Cn # [225] <reserved-1DF1F>..<reserved-1DFFF>
|
||||
1E007 ; Cn # <reserved-1E007>
|
||||
1E019..1E01A ; Cn # [2] <reserved-1E019>..<reserved-1E01A>
|
||||
1E022 ; Cn # <reserved-1E022>
|
||||
|
@ -598,9 +613,14 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1E12D..1E12F ; Cn # [3] <reserved-1E12D>..<reserved-1E12F>
|
||||
1E13E..1E13F ; Cn # [2] <reserved-1E13E>..<reserved-1E13F>
|
||||
1E14A..1E14D ; Cn # [4] <reserved-1E14A>..<reserved-1E14D>
|
||||
1E150..1E2BF ; Cn # [368] <reserved-1E150>..<reserved-1E2BF>
|
||||
1E150..1E28F ; Cn # [320] <reserved-1E150>..<reserved-1E28F>
|
||||
1E2AF..1E2BF ; Cn # [17] <reserved-1E2AF>..<reserved-1E2BF>
|
||||
1E2FA..1E2FE ; Cn # [5] <reserved-1E2FA>..<reserved-1E2FE>
|
||||
1E300..1E7FF ; Cn # [1280] <reserved-1E300>..<reserved-1E7FF>
|
||||
1E300..1E7DF ; Cn # [1248] <reserved-1E300>..<reserved-1E7DF>
|
||||
1E7E7 ; Cn # <reserved-1E7E7>
|
||||
1E7EC ; Cn # <reserved-1E7EC>
|
||||
1E7EF ; Cn # <reserved-1E7EF>
|
||||
1E7FF ; Cn # <reserved-1E7FF>
|
||||
1E8C5..1E8C6 ; Cn # [2] <reserved-1E8C5>..<reserved-1E8C6>
|
||||
1E8D7..1E8FF ; Cn # [41] <reserved-1E8D7>..<reserved-1E8FF>
|
||||
1E94C..1E94F ; Cn # [4] <reserved-1E94C>..<reserved-1E94F>
|
||||
|
@ -654,34 +674,35 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1F249..1F24F ; Cn # [7] <reserved-1F249>..<reserved-1F24F>
|
||||
1F252..1F25F ; Cn # [14] <reserved-1F252>..<reserved-1F25F>
|
||||
1F266..1F2FF ; Cn # [154] <reserved-1F266>..<reserved-1F2FF>
|
||||
1F6D8..1F6DF ; Cn # [8] <reserved-1F6D8>..<reserved-1F6DF>
|
||||
1F6D8..1F6DC ; Cn # [5] <reserved-1F6D8>..<reserved-1F6DC>
|
||||
1F6ED..1F6EF ; Cn # [3] <reserved-1F6ED>..<reserved-1F6EF>
|
||||
1F6FD..1F6FF ; Cn # [3] <reserved-1F6FD>..<reserved-1F6FF>
|
||||
1F774..1F77F ; Cn # [12] <reserved-1F774>..<reserved-1F77F>
|
||||
1F7D9..1F7DF ; Cn # [7] <reserved-1F7D9>..<reserved-1F7DF>
|
||||
1F7EC..1F7FF ; Cn # [20] <reserved-1F7EC>..<reserved-1F7FF>
|
||||
1F7EC..1F7EF ; Cn # [4] <reserved-1F7EC>..<reserved-1F7EF>
|
||||
1F7F1..1F7FF ; Cn # [15] <reserved-1F7F1>..<reserved-1F7FF>
|
||||
1F80C..1F80F ; Cn # [4] <reserved-1F80C>..<reserved-1F80F>
|
||||
1F848..1F84F ; Cn # [8] <reserved-1F848>..<reserved-1F84F>
|
||||
1F85A..1F85F ; Cn # [6] <reserved-1F85A>..<reserved-1F85F>
|
||||
1F888..1F88F ; Cn # [8] <reserved-1F888>..<reserved-1F88F>
|
||||
1F8AE..1F8AF ; Cn # [2] <reserved-1F8AE>..<reserved-1F8AF>
|
||||
1F8B2..1F8FF ; Cn # [78] <reserved-1F8B2>..<reserved-1F8FF>
|
||||
1F979 ; Cn # <reserved-1F979>
|
||||
1F9CC ; Cn # <reserved-1F9CC>
|
||||
1FA54..1FA5F ; Cn # [12] <reserved-1FA54>..<reserved-1FA5F>
|
||||
1FA6E..1FA6F ; Cn # [2] <reserved-1FA6E>..<reserved-1FA6F>
|
||||
1FA75..1FA77 ; Cn # [3] <reserved-1FA75>..<reserved-1FA77>
|
||||
1FA7B..1FA7F ; Cn # [5] <reserved-1FA7B>..<reserved-1FA7F>
|
||||
1FA7D..1FA7F ; Cn # [3] <reserved-1FA7D>..<reserved-1FA7F>
|
||||
1FA87..1FA8F ; Cn # [9] <reserved-1FA87>..<reserved-1FA8F>
|
||||
1FAA9..1FAAF ; Cn # [7] <reserved-1FAA9>..<reserved-1FAAF>
|
||||
1FAB7..1FABF ; Cn # [9] <reserved-1FAB7>..<reserved-1FABF>
|
||||
1FAC3..1FACF ; Cn # [13] <reserved-1FAC3>..<reserved-1FACF>
|
||||
1FAD7..1FAFF ; Cn # [41] <reserved-1FAD7>..<reserved-1FAFF>
|
||||
1FAAD..1FAAF ; Cn # [3] <reserved-1FAAD>..<reserved-1FAAF>
|
||||
1FABB..1FABF ; Cn # [5] <reserved-1FABB>..<reserved-1FABF>
|
||||
1FAC6..1FACF ; Cn # [10] <reserved-1FAC6>..<reserved-1FACF>
|
||||
1FADA..1FADF ; Cn # [6] <reserved-1FADA>..<reserved-1FADF>
|
||||
1FAE8..1FAEF ; Cn # [8] <reserved-1FAE8>..<reserved-1FAEF>
|
||||
1FAF7..1FAFF ; Cn # [9] <reserved-1FAF7>..<reserved-1FAFF>
|
||||
1FB93 ; Cn # <reserved-1FB93>
|
||||
1FBCB..1FBEF ; Cn # [37] <reserved-1FBCB>..<reserved-1FBEF>
|
||||
1FBFA..1FFFF ; Cn # [1030] <reserved-1FBFA>..<noncharacter-1FFFF>
|
||||
2A6DE..2A6FF ; Cn # [34] <reserved-2A6DE>..<reserved-2A6FF>
|
||||
2B735..2B73F ; Cn # [11] <reserved-2B735>..<reserved-2B73F>
|
||||
2A6E0..2A6FF ; Cn # [32] <reserved-2A6E0>..<reserved-2A6FF>
|
||||
2B739..2B73F ; Cn # [7] <reserved-2B739>..<reserved-2B73F>
|
||||
2B81E..2B81F ; Cn # [2] <reserved-2B81E>..<reserved-2B81F>
|
||||
2CEA2..2CEAF ; Cn # [14] <reserved-2CEA2>..<reserved-2CEAF>
|
||||
2EBE1..2F7FF ; Cn # [3103] <reserved-2EBE1>..<reserved-2F7FF>
|
||||
|
@ -693,7 +714,7 @@ E01F0..EFFFF ; Cn # [65040] <reserved-E01F0>..<noncharacter-EFFFF>
|
|||
FFFFE..FFFFF ; Cn # [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
|
||||
10FFFE..10FFFF; Cn # [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
|
||||
|
||||
# Total code points: 830672
|
||||
# Total code points: 829834
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1130,7 +1151,7 @@ FFFFE..FFFFF ; Cn # [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
|
|||
213E..213F ; Lu # [2] DOUBLE-STRUCK CAPITAL GAMMA..DOUBLE-STRUCK CAPITAL PI
|
||||
2145 ; Lu # DOUBLE-STRUCK ITALIC CAPITAL D
|
||||
2183 ; Lu # ROMAN NUMERAL REVERSED ONE HUNDRED
|
||||
2C00..2C2E ; Lu # [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
|
||||
2C00..2C2F ; Lu # [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
|
||||
2C60 ; Lu # LATIN CAPITAL LETTER L WITH DOUBLE BAR
|
||||
2C62..2C64 ; Lu # [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL
|
||||
2C67 ; Lu # LATIN CAPITAL LETTER H WITH DESCENDER
|
||||
|
@ -1295,13 +1316,21 @@ A7B8 ; Lu # LATIN CAPITAL LETTER U WITH STROKE
|
|||
A7BA ; Lu # LATIN CAPITAL LETTER GLOTTAL A
|
||||
A7BC ; Lu # LATIN CAPITAL LETTER GLOTTAL I
|
||||
A7BE ; Lu # LATIN CAPITAL LETTER GLOTTAL U
|
||||
A7C0 ; Lu # LATIN CAPITAL LETTER OLD POLISH O
|
||||
A7C2 ; Lu # LATIN CAPITAL LETTER ANGLICANA W
|
||||
A7C4..A7C7 ; Lu # [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
|
||||
A7C9 ; Lu # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D0 ; Lu # LATIN CAPITAL LETTER CLOSED INSULAR G
|
||||
A7D6 ; Lu # LATIN CAPITAL LETTER MIDDLE SCOTS S
|
||||
A7D8 ; Lu # LATIN CAPITAL LETTER SIGMOID S
|
||||
A7F5 ; Lu # LATIN CAPITAL LETTER REVERSED HALF H
|
||||
FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
10400..10427 ; Lu # [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW
|
||||
104B0..104D3 ; Lu # [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
|
||||
10570..1057A ; Lu # [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
|
||||
1057C..1058A ; Lu # [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
|
||||
1058C..10592 ; Lu # [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
|
||||
10594..10595 ; Lu # [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
|
||||
10C80..10CB2 ; Lu # [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US
|
||||
118A0..118BF ; Lu # [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO
|
||||
16E40..16E5F ; Lu # [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y
|
||||
|
@ -1338,7 +1367,7 @@ FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP
|
|||
1D7CA ; Lu # MATHEMATICAL BOLD CAPITAL DIGAMMA
|
||||
1E900..1E921 ; Lu # [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA
|
||||
|
||||
# Total code points: 1791
|
||||
# Total code points: 1831
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1775,7 +1804,7 @@ FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP
|
|||
2146..2149 ; Ll # [4] DOUBLE-STRUCK ITALIC SMALL D..DOUBLE-STRUCK ITALIC SMALL J
|
||||
214E ; Ll # TURNED SMALL F
|
||||
2184 ; Ll # LATIN SMALL LETTER REVERSED C
|
||||
2C30..2C5E ; Ll # [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE
|
||||
2C30..2C5F ; Ll # [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
|
||||
2C61 ; Ll # LATIN SMALL LETTER L WITH DOUBLE BAR
|
||||
2C65..2C66 ; Ll # [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE
|
||||
2C68 ; Ll # LATIN SMALL LETTER H WITH DESCENDER
|
||||
|
@ -1944,9 +1973,15 @@ A7B9 ; Ll # LATIN SMALL LETTER U WITH STROKE
|
|||
A7BB ; Ll # LATIN SMALL LETTER GLOTTAL A
|
||||
A7BD ; Ll # LATIN SMALL LETTER GLOTTAL I
|
||||
A7BF ; Ll # LATIN SMALL LETTER GLOTTAL U
|
||||
A7C1 ; Ll # LATIN SMALL LETTER OLD POLISH O
|
||||
A7C3 ; Ll # LATIN SMALL LETTER ANGLICANA W
|
||||
A7C8 ; Ll # LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY
|
||||
A7CA ; Ll # LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D1 ; Ll # LATIN SMALL LETTER CLOSED INSULAR G
|
||||
A7D3 ; Ll # LATIN SMALL LETTER DOUBLE THORN
|
||||
A7D5 ; Ll # LATIN SMALL LETTER DOUBLE WYNN
|
||||
A7D7 ; Ll # LATIN SMALL LETTER MIDDLE SCOTS S
|
||||
A7D9 ; Ll # LATIN SMALL LETTER SIGMOID S
|
||||
A7F6 ; Ll # LATIN SMALL LETTER REVERSED HALF H
|
||||
A7FA ; Ll # LATIN LETTER SMALL CAPITAL TURNED M
|
||||
AB30..AB5A ; Ll # [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
|
||||
|
@ -1957,6 +1992,10 @@ FB13..FB17 ; Ll # [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGAT
|
|||
FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
|
||||
10428..1044F ; Ll # [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW
|
||||
104D8..104FB ; Ll # [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
|
||||
10597..105A1 ; Ll # [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
|
||||
105A3..105B1 ; Ll # [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
|
||||
105B3..105B9 ; Ll # [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
|
||||
105BB..105BC ; Ll # [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
|
||||
10CC0..10CF2 ; Ll # [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US
|
||||
118C0..118DF ; Ll # [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
|
||||
16E60..16E7F ; Ll # [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y
|
||||
|
@ -1988,9 +2027,11 @@ FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL
|
|||
1D7AA..1D7C2 ; Ll # [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
|
||||
1D7C4..1D7C9 ; Ll # [6] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
|
||||
1D7CB ; Ll # MATHEMATICAL BOLD SMALL DIGAMMA
|
||||
1DF00..1DF09 ; Ll # [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
|
||||
1DF0B..1DF1E ; Ll # [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
|
||||
1E922..1E943 ; Ll # [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
|
||||
|
||||
# Total code points: 2155
|
||||
# Total code points: 2227
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2028,6 +2069,7 @@ FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL
|
|||
081A ; Lm # SAMARITAN MODIFIER LETTER EPENTHETIC YUT
|
||||
0824 ; Lm # SAMARITAN MODIFIER LETTER SHORT A
|
||||
0828 ; Lm # SAMARITAN MODIFIER LETTER I
|
||||
08C9 ; Lm # ARABIC SMALL FARSI YEH
|
||||
0971 ; Lm # DEVANAGARI SIGN HIGH SPACING DOT
|
||||
0E46 ; Lm # THAI CHARACTER MAIYAMOK
|
||||
0EC6 ; Lm # LAO KO LA
|
||||
|
@ -2058,6 +2100,7 @@ A69C..A69D ; Lm # [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER C
|
|||
A717..A71F ; Lm # [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
|
||||
A770 ; Lm # MODIFIER LETTER US
|
||||
A788 ; Lm # MODIFIER LETTER LOW CIRCUMFLEX ACCENT
|
||||
A7F2..A7F4 ; Lm # [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
|
||||
A7F8..A7F9 ; Lm # [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
|
||||
A9CF ; Lm # JAVANESE PANGRANGKEP
|
||||
A9E6 ; Lm # MYANMAR MODIFIER LETTER SHAN REDUPLICATION
|
||||
|
@ -2068,14 +2111,20 @@ AB5C..AB5F ; Lm # [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U W
|
|||
AB69 ; Lm # MODIFIER LETTER SMALL TURNED W
|
||||
FF70 ; Lm # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
10780..10785 ; Lm # [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
|
||||
10787..107B0 ; Lm # [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
|
||||
107B2..107BA ; Lm # [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
|
||||
16B40..16B43 ; Lm # [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
|
||||
16F93..16F9F ; Lm # [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
|
||||
16FE0..16FE1 ; Lm # [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
|
||||
16FE3 ; Lm # OLD CHINESE ITERATION MARK
|
||||
1AFF0..1AFF3 ; Lm # [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
|
||||
1AFF5..1AFFB ; Lm # [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
|
||||
1AFFD..1AFFE ; Lm # [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
|
||||
1E137..1E13D ; Lm # [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
|
||||
1E94B ; Lm # ADLAM NASALIZATION MARK
|
||||
|
||||
# Total code points: 260
|
||||
# Total code points: 334
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2104,8 +2153,9 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
0800..0815 ; Lo # [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
|
||||
0840..0858 ; Lo # [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
|
||||
0860..086A ; Lo # [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
|
||||
08A0..08B4 ; Lo # [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW
|
||||
08B6..08C7 ; Lo # [18] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
|
||||
0870..0887 ; Lo # [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
|
||||
0889..088E ; Lo # [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
|
||||
08A0..08C8 ; Lo # [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
|
||||
0904..0939 ; Lo # [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
|
||||
093D ; Lo # DEVANAGARI SIGN AVAGRAHA
|
||||
0950 ; Lo # DEVANAGARI OM
|
||||
|
@ -2170,6 +2220,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
0C2A..0C39 ; Lo # [16] TELUGU LETTER PA..TELUGU LETTER HA
|
||||
0C3D ; Lo # TELUGU SIGN AVAGRAHA
|
||||
0C58..0C5A ; Lo # [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
|
||||
0C5D ; Lo # TELUGU LETTER NAKAARA POLLU
|
||||
0C60..0C61 ; Lo # [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
|
||||
0C80 ; Lo # KANNADA SIGN SPACING CANDRABINDU
|
||||
0C85..0C8C ; Lo # [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
|
||||
|
@ -2178,7 +2229,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
0CAA..0CB3 ; Lo # [10] KANNADA LETTER PA..KANNADA LETTER LLA
|
||||
0CB5..0CB9 ; Lo # [5] KANNADA LETTER VA..KANNADA LETTER HA
|
||||
0CBD ; Lo # KANNADA SIGN AVAGRAHA
|
||||
0CDE ; Lo # KANNADA LETTER FA
|
||||
0CDD..0CDE ; Lo # [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
|
||||
0CE0..0CE1 ; Lo # [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
|
||||
0CF1..0CF2 ; Lo # [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
|
||||
0D04..0D0C ; Lo # [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
|
||||
|
@ -2242,9 +2293,8 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
1681..169A ; Lo # [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
|
||||
16A0..16EA ; Lo # [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
|
||||
16F1..16F8 ; Lo # [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
|
||||
1700..170C ; Lo # [13] TAGALOG LETTER A..TAGALOG LETTER YA
|
||||
170E..1711 ; Lo # [4] TAGALOG LETTER LA..TAGALOG LETTER HA
|
||||
1720..1731 ; Lo # [18] HANUNOO LETTER A..HANUNOO LETTER HA
|
||||
1700..1711 ; Lo # [18] TAGALOG LETTER A..TAGALOG LETTER HA
|
||||
171F..1731 ; Lo # [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA
|
||||
1740..1751 ; Lo # [18] BUHID LETTER A..BUHID LETTER HA
|
||||
1760..176C ; Lo # [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
|
||||
176E..1770 ; Lo # [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
|
||||
|
@ -2264,7 +2314,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
1A00..1A16 ; Lo # [23] BUGINESE LETTER KA..BUGINESE LETTER HA
|
||||
1A20..1A54 ; Lo # [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA
|
||||
1B05..1B33 ; Lo # [47] BALINESE LETTER AKARA..BALINESE LETTER HA
|
||||
1B45..1B4B ; Lo # [7] BALINESE LETTER KAF SASAK..BALINESE LETTER ASYURA SASAK
|
||||
1B45..1B4C ; Lo # [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
|
||||
1B83..1BA0 ; Lo # [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
|
||||
1BAE..1BAF ; Lo # [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
|
||||
1BBA..1BE5 ; Lo # [44] SUNDANESE AVAGRAHA..BATAK LETTER U
|
||||
|
@ -2297,8 +2347,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
31A0..31BF ; Lo # [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
|
||||
31F0..31FF ; Lo # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
|
||||
3400..4DBF ; Lo # [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
|
||||
4E00..9FFC ; Lo # [20989] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFC
|
||||
A000..A014 ; Lo # [21] YI SYLLABLE IT..YI SYLLABLE E
|
||||
4E00..A014 ; Lo # [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E
|
||||
A016..A48C ; Lo # [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
|
||||
A4D0..A4F7 ; Lo # [40] LISU LETTER BA..LISU LETTER OE
|
||||
A500..A60B ; Lo # [268] VAI SYLLABLE EE..VAI SYLLABLE NG
|
||||
|
@ -2426,9 +2475,12 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
10F00..10F1C ; Lo # [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
|
||||
10F27 ; Lo # OLD SOGDIAN LIGATURE AYIN-DALETH
|
||||
10F30..10F45 ; Lo # [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
|
||||
10F70..10F81 ; Lo # [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
|
||||
10FB0..10FC4 ; Lo # [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW
|
||||
10FE0..10FF6 ; Lo # [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH
|
||||
11003..11037 ; Lo # [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
|
||||
11071..11072 ; Lo # [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
|
||||
11075 ; Lo # BRAHMI LETTER OLD TAMIL LLA
|
||||
11083..110AF ; Lo # [45] KAITHI LETTER A..KAITHI LETTER HA
|
||||
110D0..110E8 ; Lo # [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE
|
||||
11103..11126 ; Lo # [36] CHAKMA LETTER AA..CHAKMA LETTER HAA
|
||||
|
@ -2470,6 +2522,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
11680..116AA ; Lo # [43] TAKRI LETTER A..TAKRI LETTER RRA
|
||||
116B8 ; Lo # TAKRI LETTER ARCHAIC KHA
|
||||
11700..1171A ; Lo # [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA
|
||||
11740..11746 ; Lo # [7] AHOM LETTER CA..AHOM LETTER LLA
|
||||
11800..1182B ; Lo # [44] DOGRA LETTER A..DOGRA LETTER RRA
|
||||
118FF..11906 ; Lo # [8] WARANG CITI OM..DIVES AKURU LETTER E
|
||||
11909 ; Lo # DIVES AKURU LETTER O
|
||||
|
@ -2488,7 +2541,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
11A50 ; Lo # SOYOMBO LETTER A
|
||||
11A5C..11A89 ; Lo # [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA
|
||||
11A9D ; Lo # SOYOMBO MARK PLUTA
|
||||
11AC0..11AF8 ; Lo # [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
|
||||
11AB0..11AF8 ; Lo # [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL
|
||||
11C00..11C08 ; Lo # [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
|
||||
11C0A..11C2E ; Lo # [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
|
||||
11C40 ; Lo # BHAIKSUKI SIGN AVAGRAHA
|
||||
|
@ -2505,10 +2558,12 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
11FB0 ; Lo # LISU LETTER YHA
|
||||
12000..12399 ; Lo # [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
|
||||
12480..12543 ; Lo # [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
|
||||
12F90..12FF0 ; Lo # [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
|
||||
13000..1342E ; Lo # [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
|
||||
14400..14646 ; Lo # [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
|
||||
16800..16A38 ; Lo # [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
|
||||
16A40..16A5E ; Lo # [31] MRO LETTER TA..MRO LETTER TEK
|
||||
16A70..16ABE ; Lo # [79] TANGSA LETTER OZ..TANGSA LETTER ZA
|
||||
16AD0..16AED ; Lo # [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
|
||||
16B00..16B2F ; Lo # [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
|
||||
16B63..16B77 ; Lo # [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
|
||||
|
@ -2518,7 +2573,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
17000..187F7 ; Lo # [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
|
||||
18800..18CD5 ; Lo # [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5
|
||||
18D00..18D08 ; Lo # [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
|
||||
1B000..1B11E ; Lo # [287] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER N-MU-MO-2
|
||||
1B000..1B122 ; Lo # [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
|
||||
1B150..1B152 ; Lo # [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
|
||||
1B164..1B167 ; Lo # [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
|
||||
1B170..1B2FB ; Lo # [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
|
||||
|
@ -2526,9 +2581,15 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1BC70..1BC7C ; Lo # [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
|
||||
1BC80..1BC88 ; Lo # [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
|
||||
1BC90..1BC99 ; Lo # [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
|
||||
1DF0A ; Lo # LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
|
||||
1E100..1E12C ; Lo # [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
|
||||
1E14E ; Lo # NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
|
||||
1E290..1E2AD ; Lo # [30] TOTO LETTER PA..TOTO LETTER A
|
||||
1E2C0..1E2EB ; Lo # [44] WANCHO LETTER AA..WANCHO LETTER YIH
|
||||
1E7E0..1E7E6 ; Lo # [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
|
||||
1E7E8..1E7EB ; Lo # [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
|
||||
1E7ED..1E7EE ; Lo # [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
|
||||
1E7F0..1E7FE ; Lo # [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
|
||||
1E800..1E8C4 ; Lo # [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
|
||||
1EE00..1EE03 ; Lo # [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL
|
||||
1EE05..1EE1F ; Lo # [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF
|
||||
|
@ -2563,15 +2624,15 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1EEA1..1EEA3 ; Lo # [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
|
||||
1EEA5..1EEA9 ; Lo # [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
|
||||
1EEAB..1EEBB ; Lo # [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
|
||||
20000..2A6DD ; Lo # [42718] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DD
|
||||
2A700..2B734 ; Lo # [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
|
||||
20000..2A6DF ; Lo # [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
|
||||
2A700..2B738 ; Lo # [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
|
||||
2B740..2B81D ; Lo # [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
|
||||
2B820..2CEA1 ; Lo # [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
|
||||
2CEB0..2EBE0 ; Lo # [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
|
||||
2F800..2FA1D ; Lo # [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
|
||||
30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
|
||||
|
||||
# Total code points: 127004
|
||||
# Total code points: 127333
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2601,7 +2662,8 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
0825..0827 ; Mn # [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
|
||||
0829..082D ; Mn # [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
|
||||
0859..085B ; Mn # [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
|
||||
08D3..08E1 ; Mn # [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
|
||||
0898..089F ; Mn # [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
|
||||
08CA..08E1 ; Mn # [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
|
||||
08E3..0902 ; Mn # [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
|
||||
093A ; Mn # DEVANAGARI VOWEL SIGN OE
|
||||
093C ; Mn # DEVANAGARI SIGN NUKTA
|
||||
|
@ -2642,6 +2704,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
0BCD ; Mn # TAMIL SIGN VIRAMA
|
||||
0C00 ; Mn # TELUGU SIGN COMBINING CANDRABINDU ABOVE
|
||||
0C04 ; Mn # TELUGU SIGN COMBINING ANUSVARA ABOVE
|
||||
0C3C ; Mn # TELUGU SIGN NUKTA
|
||||
0C3E..0C40 ; Mn # [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
|
||||
0C46..0C48 ; Mn # [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
|
||||
0C4A..0C4D ; Mn # [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
|
||||
|
@ -2691,7 +2754,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
109D ; Mn # MYANMAR VOWEL SIGN AITON AI
|
||||
135D..135F ; Mn # [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
|
||||
1712..1714 ; Mn # [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
|
||||
1732..1734 ; Mn # [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
|
||||
1732..1733 ; Mn # [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
|
||||
1752..1753 ; Mn # [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
|
||||
1772..1773 ; Mn # [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
|
||||
17B4..17B5 ; Mn # [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
|
||||
|
@ -2700,6 +2763,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
17C9..17D3 ; Mn # [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
|
||||
17DD ; Mn # KHMER SIGN ATTHACAN
|
||||
180B..180D ; Mn # [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||
180F ; Mn # MONGOLIAN FREE VARIATION SELECTOR FOUR
|
||||
1885..1886 ; Mn # [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
|
||||
18A9 ; Mn # MONGOLIAN LETTER ALI GALI DAGALGA
|
||||
1920..1922 ; Mn # [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
|
||||
|
@ -2716,7 +2780,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1A73..1A7C ; Mn # [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
|
||||
1A7F ; Mn # TAI THAM COMBINING CRYPTOGRAMMIC DOT
|
||||
1AB0..1ABD ; Mn # [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
|
||||
1ABF..1AC0 ; Mn # [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
|
||||
1ABF..1ACE ; Mn # [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
|
||||
1B00..1B03 ; Mn # [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
|
||||
1B34 ; Mn # BALINESE SIGN REREKAN
|
||||
1B36..1B3A ; Mn # [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
|
||||
|
@ -2739,8 +2803,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1CED ; Mn # VEDIC SIGN TIRYAK
|
||||
1CF4 ; Mn # VEDIC TONE CANDRA ABOVE
|
||||
1CF8..1CF9 ; Mn # [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
1DC0..1DF9 ; Mn # [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
|
||||
1DFB..1DFF ; Mn # [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
1DC0..1DFF ; Mn # [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
20D0..20DC ; Mn # [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
|
||||
20E1 ; Mn # COMBINING LEFT RIGHT ARROW ABOVE
|
||||
20E5..20F0 ; Mn # [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
|
||||
|
@ -2799,11 +2862,15 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
|
|||
10D24..10D27 ; Mn # [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
|
||||
10EAB..10EAC ; Mn # [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
|
||||
10F46..10F50 ; Mn # [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
|
||||
10F82..10F85 ; Mn # [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
|
||||
11001 ; Mn # BRAHMI SIGN ANUSVARA
|
||||
11038..11046 ; Mn # [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
|
||||
11070 ; Mn # BRAHMI SIGN OLD TAMIL VIRAMA
|
||||
11073..11074 ; Mn # [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
|
||||
1107F..11081 ; Mn # [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
|
||||
110B3..110B6 ; Mn # [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
|
||||
110B9..110BA ; Mn # [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
|
||||
110C2 ; Mn # KAITHI VOWEL SIGN VOCALIC R
|
||||
11100..11102 ; Mn # [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
|
||||
11127..1112B ; Mn # [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
|
||||
1112D..11134 ; Mn # [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
|
||||
|
@ -2883,6 +2950,8 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
|
|||
16F8F..16F92 ; Mn # [4] MIAO TONE RIGHT..MIAO TONE BELOW
|
||||
16FE4 ; Mn # KHITAN SMALL SCRIPT FILLER
|
||||
1BC9D..1BC9E ; Mn # [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
|
||||
1CF00..1CF2D ; Mn # [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
|
||||
1CF30..1CF46 ; Mn # [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
|
||||
1D167..1D169 ; Mn # [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
|
||||
1D17B..1D182 ; Mn # [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
|
||||
1D185..1D18B ; Mn # [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
|
||||
|
@ -2900,12 +2969,13 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
|
|||
1E023..1E024 ; Mn # [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
|
||||
1E026..1E02A ; Mn # [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
|
||||
1E130..1E136 ; Mn # [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
|
||||
1E2AE ; Mn # TOTO SIGN RISING TONE
|
||||
1E2EC..1E2EF ; Mn # [4] WANCHO TONE TUP..WANCHO TONE KOINI
|
||||
1E8D0..1E8D6 ; Mn # [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
|
||||
1E944..1E94A ; Mn # [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
|
||||
E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
||||
|
||||
# Total code points: 1839
|
||||
# Total code points: 1950
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2980,6 +3050,8 @@ A670..A672 ; Me # [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRIL
|
|||
1087..108C ; Mc # [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3
|
||||
108F ; Mc # MYANMAR SIGN RUMAI PALAUNG TONE-5
|
||||
109A..109C ; Mc # [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
|
||||
1715 ; Mc # TAGALOG SIGN PAMUDPOD
|
||||
1734 ; Mc # HANUNOO SIGN PAMUDPOD
|
||||
17B6 ; Mc # KHMER VOWEL SIGN AA
|
||||
17BE..17C5 ; Mc # [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
|
||||
17C7..17C8 ; Mc # [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
|
||||
|
@ -3099,7 +3171,7 @@ ABEC ; Mc # MEETEI MAYEK LUM IYEK
|
|||
1D165..1D166 ; Mc # [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
|
||||
1D16D..1D172 ; Mc # [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5
|
||||
|
||||
# Total code points: 443
|
||||
# Total code points: 445
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3160,6 +3232,7 @@ FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
|
|||
11D50..11D59 ; Nd # [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
|
||||
11DA0..11DA9 ; Nd # [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
|
||||
16A60..16A69 ; Nd # [10] MRO DIGIT ZERO..MRO DIGIT NINE
|
||||
16AC0..16AC9 ; Nd # [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
|
||||
16B50..16B59 ; Nd # [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
|
||||
1D7CE..1D7FF ; Nd # [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
|
||||
1E140..1E149 ; Nd # [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
|
||||
|
@ -3167,7 +3240,7 @@ FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
|
|||
1E950..1E959 ; Nd # [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
|
||||
1FBF0..1FBF9 ; Nd # [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
|
||||
|
||||
# Total code points: 650
|
||||
# Total code points: 660
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3314,6 +3387,7 @@ A830..A835 ; No # [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTIO
|
|||
061C ; Cf # ARABIC LETTER MARK
|
||||
06DD ; Cf # ARABIC END OF AYAH
|
||||
070F ; Cf # SYRIAC ABBREVIATION MARK
|
||||
0890..0891 ; Cf # [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
|
||||
08E2 ; Cf # ARABIC DISPUTED END OF AYAH
|
||||
180E ; Cf # MONGOLIAN VOWEL SEPARATOR
|
||||
200B..200F ; Cf # [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
|
||||
|
@ -3330,7 +3404,7 @@ FFF9..FFFB ; Cf # [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION
|
|||
E0001 ; Cf # LANGUAGE TAG
|
||||
E0020..E007F ; Cf # [96] TAG SPACE..CANCEL TAG
|
||||
|
||||
# Total code points: 161
|
||||
# Total code points: 163
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3364,6 +3438,7 @@ D800..DFFF ; Cs # [2048] <surrogate-D800>..<surrogate-DFFF>
|
|||
2E1A ; Pd # HYPHEN WITH DIAERESIS
|
||||
2E3A..2E3B ; Pd # [2] TWO-EM DASH..THREE-EM DASH
|
||||
2E40 ; Pd # DOUBLE HYPHEN
|
||||
2E5D ; Pd # OBLIQUE HYPHEN
|
||||
301C ; Pd # WAVE DASH
|
||||
3030 ; Pd # WAVY DASH
|
||||
30A0 ; Pd # KATAKANA-HIRAGANA DOUBLE HYPHEN
|
||||
|
@ -3373,7 +3448,7 @@ FE63 ; Pd # SMALL HYPHEN-MINUS
|
|||
FF0D ; Pd # FULLWIDTH HYPHEN-MINUS
|
||||
10EAD ; Pd # YEZIDI HYPHENATION MARK
|
||||
|
||||
# Total code points: 25
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3425,6 +3500,10 @@ FF0D ; Pd # FULLWIDTH HYPHEN-MINUS
|
|||
2E26 ; Ps # LEFT SIDEWAYS U BRACKET
|
||||
2E28 ; Ps # LEFT DOUBLE PARENTHESIS
|
||||
2E42 ; Ps # DOUBLE LOW-REVERSED-9 QUOTATION MARK
|
||||
2E55 ; Ps # LEFT SQUARE BRACKET WITH STROKE
|
||||
2E57 ; Ps # LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59 ; Ps # TOP HALF LEFT PARENTHESIS
|
||||
2E5B ; Ps # BOTTOM HALF LEFT PARENTHESIS
|
||||
3008 ; Ps # LEFT ANGLE BRACKET
|
||||
300A ; Ps # LEFT DOUBLE ANGLE BRACKET
|
||||
300C ; Ps # LEFT CORNER BRACKET
|
||||
|
@ -3455,7 +3534,7 @@ FF5B ; Ps # FULLWIDTH LEFT CURLY BRACKET
|
|||
FF5F ; Ps # FULLWIDTH LEFT WHITE PARENTHESIS
|
||||
FF62 ; Ps # HALFWIDTH LEFT CORNER BRACKET
|
||||
|
||||
# Total code points: 75
|
||||
# Total code points: 79
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3504,6 +3583,10 @@ FF62 ; Ps # HALFWIDTH LEFT CORNER BRACKET
|
|||
2E25 ; Pe # BOTTOM RIGHT HALF BRACKET
|
||||
2E27 ; Pe # RIGHT SIDEWAYS U BRACKET
|
||||
2E29 ; Pe # RIGHT DOUBLE PARENTHESIS
|
||||
2E56 ; Pe # RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E58 ; Pe # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E5A ; Pe # TOP HALF RIGHT PARENTHESIS
|
||||
2E5C ; Pe # BOTTOM HALF RIGHT PARENTHESIS
|
||||
3009 ; Pe # RIGHT ANGLE BRACKET
|
||||
300B ; Pe # RIGHT DOUBLE ANGLE BRACKET
|
||||
300D ; Pe # RIGHT CORNER BRACKET
|
||||
|
@ -3534,7 +3617,7 @@ FF5D ; Pe # FULLWIDTH RIGHT CURLY BRACKET
|
|||
FF60 ; Pe # FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
FF63 ; Pe # HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
||||
# Total code points: 73
|
||||
# Total code points: 77
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3576,7 +3659,7 @@ FF3F ; Pc # FULLWIDTH LOW LINE
|
|||
0609..060A ; Po # [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN
|
||||
060C..060D ; Po # [2] ARABIC COMMA..ARABIC DATE SEPARATOR
|
||||
061B ; Po # ARABIC SEMICOLON
|
||||
061E..061F ; Po # [2] ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC QUESTION MARK
|
||||
061D..061F ; Po # [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK
|
||||
066A..066D ; Po # [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR
|
||||
06D4 ; Po # ARABIC FULL STOP
|
||||
0700..070D ; Po # [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS
|
||||
|
@ -3613,6 +3696,7 @@ FF3F ; Pc # FULLWIDTH LOW LINE
|
|||
1AA0..1AA6 ; Po # [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA
|
||||
1AA8..1AAD ; Po # [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG
|
||||
1B5A..1B60 ; Po # [7] BALINESE PANTI..BALINESE PAMENENG
|
||||
1B7D..1B7E ; Po # [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG
|
||||
1BFC..1BFF ; Po # [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
|
||||
1C3B..1C3F ; Po # [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK
|
||||
1C7E..1C7F ; Po # [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
|
||||
|
@ -3641,7 +3725,7 @@ FF3F ; Pc # FULLWIDTH LOW LINE
|
|||
2E3C..2E3F ; Po # [4] STENOGRAPHIC FULL STOP..CAPITULUM
|
||||
2E41 ; Po # REVERSED COMMA
|
||||
2E43..2E4F ; Po # [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER
|
||||
2E52 ; Po # TIRONIAN SIGN CAPITAL ET
|
||||
2E52..2E54 ; Po # [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK
|
||||
3001..3003 ; Po # [3] IDEOGRAPHIC COMMA..DITTO MARK
|
||||
303D ; Po # PART ALTERNATION MARK
|
||||
30FB ; Po # KATAKANA MIDDLE DOT
|
||||
|
@ -3695,6 +3779,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
10B39..10B3F ; Po # [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION
|
||||
10B99..10B9C ; Po # [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
|
||||
10F55..10F59 ; Po # [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
|
||||
10F86..10F89 ; Po # [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS
|
||||
11047..1104D ; Po # [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
|
||||
110BB..110BC ; Po # [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN
|
||||
110BE..110C1 ; Po # [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
|
||||
|
@ -3713,6 +3798,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
115C1..115D7 ; Po # [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
|
||||
11641..11643 ; Po # [3] MODI DANDA..MODI ABBREVIATION SIGN
|
||||
11660..1166C ; Po # [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
|
||||
116B9 ; Po # TAKRI ABBREVIATION SIGN
|
||||
1173C..1173E ; Po # [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
|
||||
1183B ; Po # DOGRA ABBREVIATION SIGN
|
||||
11944..11946 ; Po # [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK
|
||||
|
@ -3725,6 +3811,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
11EF7..11EF8 ; Po # [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
|
||||
11FFF ; Po # TAMIL PUNCTUATION END OF TEXT
|
||||
12470..12474 ; Po # [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
|
||||
12FF1..12FF2 ; Po # [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
|
||||
16A6E..16A6F ; Po # [2] MRO DANDA..MRO DOUBLE DANDA
|
||||
16AF5 ; Po # BASSA VAH FULL STOP
|
||||
16B37..16B3B ; Po # [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
|
||||
|
@ -3735,7 +3822,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
1DA87..1DA8B ; Po # [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS
|
||||
1E95E..1E95F ; Po # [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK
|
||||
|
||||
# Total code points: 593
|
||||
# Total code points: 605
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3823,7 +3910,7 @@ FFE9..FFEC ; Sm # [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW
|
|||
0BF9 ; Sc # TAMIL RUPEE SIGN
|
||||
0E3F ; Sc # THAI CURRENCY SYMBOL BAHT
|
||||
17DB ; Sc # KHMER CURRENCY SYMBOL RIEL
|
||||
20A0..20BF ; Sc # [32] EURO-CURRENCY SIGN..BITCOIN SIGN
|
||||
20A0..20C0 ; Sc # [33] EURO-CURRENCY SIGN..SOM SIGN
|
||||
A838 ; Sc # NORTH INDIC RUPEE MARK
|
||||
FDFC ; Sc # RIAL SIGN
|
||||
FE69 ; Sc # SMALL DOLLAR SIGN
|
||||
|
@ -3834,7 +3921,7 @@ FFE5..FFE6 ; Sc # [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
|
|||
1E2FF ; Sc # WANCHO NGUN SIGN
|
||||
1ECB0 ; Sc # INDIC SIYAQ RUPEE MARK
|
||||
|
||||
# Total code points: 62
|
||||
# Total code points: 63
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3853,6 +3940,7 @@ FFE5..FFE6 ; Sc # [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
|
|||
02EF..02FF ; Sk # [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW
|
||||
0375 ; Sk # GREEK LOWER NUMERAL SIGN
|
||||
0384..0385 ; Sk # [2] GREEK TONOS..GREEK DIALYTIKA TONOS
|
||||
0888 ; Sk # ARABIC RAISED ROUND DOT
|
||||
1FBD ; Sk # GREEK KORONIS
|
||||
1FBF..1FC1 ; Sk # [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
|
||||
1FCD..1FCF ; Sk # [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI
|
||||
|
@ -3865,13 +3953,13 @@ A720..A721 ; Sk # [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER
|
|||
A789..A78A ; Sk # [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
|
||||
AB5B ; Sk # MODIFIER BREVE WITH INVERTED BREVE
|
||||
AB6A..AB6B ; Sk # [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK
|
||||
FBB2..FBC1 ; Sk # [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
|
||||
FBB2..FBC2 ; Sk # [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE
|
||||
FF3E ; Sk # FULLWIDTH CIRCUMFLEX ACCENT
|
||||
FF40 ; Sk # FULLWIDTH GRAVE ACCENT
|
||||
FFE3 ; Sk # FULLWIDTH MACRON
|
||||
1F3FB..1F3FF ; Sk # [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
|
||||
|
||||
# Total code points: 123
|
||||
# Total code points: 125
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3984,7 +4072,9 @@ A828..A82B ; So # [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-
|
|||
A836..A837 ; So # [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
|
||||
A839 ; So # NORTH INDIC QUANTITY MARK
|
||||
AA77..AA79 ; So # [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
|
||||
FDFD ; So # ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
|
||||
FD40..FD4F ; So # [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH
|
||||
FDCF ; So # ARABIC LIGATURE SALAAMUHU ALAYNAA
|
||||
FDFD..FDFF ; So # [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL
|
||||
FFE4 ; So # FULLWIDTH BROKEN BAR
|
||||
FFE8 ; So # HALFWIDTH FORMS LIGHT VERTICAL
|
||||
FFED..FFEE ; So # [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
|
||||
|
@ -4003,13 +4093,14 @@ FFFC..FFFD ; So # [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
|
|||
16B3C..16B3F ; So # [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
|
||||
16B45 ; So # PAHAWH HMONG SIGN CIM TSOV ROG
|
||||
1BC9C ; So # DUPLOYAN SIGN O WITH CROSS
|
||||
1CF50..1CFC3 ; So # [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK
|
||||
1D000..1D0F5 ; So # [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
|
||||
1D100..1D126 ; So # [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
|
||||
1D129..1D164 ; So # [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
|
||||
1D16A..1D16C ; So # [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3
|
||||
1D183..1D184 ; So # [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN
|
||||
1D18C..1D1A9 ; So # [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH
|
||||
1D1AE..1D1E8 ; So # [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN
|
||||
1D1AE..1D1EA ; So # [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
|
||||
1D200..1D241 ; So # [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
|
||||
1D245 ; So # GREEK MUSICAL LEIMMA
|
||||
1D300..1D356 ; So # [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
|
||||
|
@ -4035,32 +4126,33 @@ FFFC..FFFD ; So # [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
|
|||
1F260..1F265 ; So # [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
|
||||
1F300..1F3FA ; So # [251] CYCLONE..AMPHORA
|
||||
1F400..1F6D7 ; So # [728] RAT..ELEVATOR
|
||||
1F6E0..1F6EC ; So # [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
|
||||
1F6DD..1F6EC ; So # [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
|
||||
1F6F0..1F6FC ; So # [13] SATELLITE..ROLLER SKATE
|
||||
1F700..1F773 ; So # [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
|
||||
1F780..1F7D8 ; So # [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
|
||||
1F7E0..1F7EB ; So # [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
|
||||
1F7F0 ; So # HEAVY EQUALS SIGN
|
||||
1F800..1F80B ; So # [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
|
||||
1F810..1F847 ; So # [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
|
||||
1F850..1F859 ; So # [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
|
||||
1F860..1F887 ; So # [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
|
||||
1F890..1F8AD ; So # [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
|
||||
1F8B0..1F8B1 ; So # [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
|
||||
1F900..1F978 ; So # [121] CIRCLED CROSS FORMEE WITH FOUR DOTS..DISGUISED FACE
|
||||
1F97A..1F9CB ; So # [82] FACE WITH PLEADING EYES..BUBBLE TEA
|
||||
1F9CD..1FA53 ; So # [135] STANDING PERSON..BLACK CHESS KNIGHT-BISHOP
|
||||
1F900..1FA53 ; So # [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP
|
||||
1FA60..1FA6D ; So # [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
|
||||
1FA70..1FA74 ; So # [5] BALLET SHOES..THONG SANDAL
|
||||
1FA78..1FA7A ; So # [3] DROP OF BLOOD..STETHOSCOPE
|
||||
1FA78..1FA7C ; So # [5] DROP OF BLOOD..CRUTCH
|
||||
1FA80..1FA86 ; So # [7] YO-YO..NESTING DOLLS
|
||||
1FA90..1FAA8 ; So # [25] RINGED PLANET..ROCK
|
||||
1FAB0..1FAB6 ; So # [7] FLY..FEATHER
|
||||
1FAC0..1FAC2 ; So # [3] ANATOMICAL HEART..PEOPLE HUGGING
|
||||
1FAD0..1FAD6 ; So # [7] BLUEBERRIES..TEAPOT
|
||||
1FA90..1FAAC ; So # [29] RINGED PLANET..HAMSA
|
||||
1FAB0..1FABA ; So # [11] FLY..NEST WITH EGGS
|
||||
1FAC0..1FAC5 ; So # [6] ANATOMICAL HEART..PERSON WITH CROWN
|
||||
1FAD0..1FAD9 ; So # [10] BLUEBERRIES..JAR
|
||||
1FAE0..1FAE7 ; So # [8] MELTING FACE..BUBBLES
|
||||
1FAF0..1FAF6 ; So # [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
|
||||
1FB00..1FB92 ; So # [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
|
||||
1FB94..1FBCA ; So # [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
|
||||
|
||||
# Total code points: 6431
|
||||
# Total code points: 6605
|
||||
|
||||
# ================================================
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# GraphemeBreakProperty-13.0.0.txt
|
||||
# Date: 2019-10-21, 14:30:35 GMT
|
||||
# © 2019 Unicode®, Inc.
|
||||
# GraphemeBreakProperty-14.0.0.txt
|
||||
# Date: 2021-08-12, 23:13:02 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -21,6 +21,7 @@
|
|||
0600..0605 ; Prepend # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
|
||||
06DD ; Prepend # Cf ARABIC END OF AYAH
|
||||
070F ; Prepend # Cf SYRIAC ABBREVIATION MARK
|
||||
0890..0891 ; Prepend # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
|
||||
08E2 ; Prepend # Cf ARABIC DISPUTED END OF AYAH
|
||||
0D4E ; Prepend # Lo MALAYALAM LETTER DOT REPH
|
||||
110BD ; Prepend # Cf KAITHI NUMBER SIGN
|
||||
|
@ -32,7 +33,7 @@
|
|||
11A84..11A89 ; Prepend # Lo [6] SOYOMBO SIGN JIHVAMULIYA..SOYOMBO CLUSTER-INITIAL LETTER SA
|
||||
11D46 ; Prepend # Lo MASARAM GONDI REPHA
|
||||
|
||||
# Total code points: 24
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -104,7 +105,8 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
|
||||
0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
|
||||
0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
|
||||
08D3..08E1 ; Extend # Mn [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
|
||||
0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
|
||||
08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
|
||||
08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
|
||||
093A ; Extend # Mn DEVANAGARI VOWEL SIGN OE
|
||||
093C ; Extend # Mn DEVANAGARI SIGN NUKTA
|
||||
|
@ -151,6 +153,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
0BD7 ; Extend # Mc TAMIL AU LENGTH MARK
|
||||
0C00 ; Extend # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
|
||||
0C04 ; Extend # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE
|
||||
0C3C ; Extend # Mn TELUGU SIGN NUKTA
|
||||
0C3E..0C40 ; Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
|
||||
0C46..0C48 ; Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
|
||||
0C4A..0C4D ; Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
|
||||
|
@ -206,7 +209,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
109D ; Extend # Mn MYANMAR VOWEL SIGN AITON AI
|
||||
135D..135F ; Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
|
||||
1712..1714 ; Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
|
||||
1732..1734 ; Extend # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
|
||||
1732..1733 ; Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
|
||||
1752..1753 ; Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
|
||||
1772..1773 ; Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
|
||||
17B4..17B5 ; Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
|
||||
|
@ -215,6 +218,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
17C9..17D3 ; Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
|
||||
17DD ; Extend # Mn KHMER SIGN ATTHACAN
|
||||
180B..180D ; Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||
180F ; Extend # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR
|
||||
1885..1886 ; Extend # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
|
||||
18A9 ; Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA
|
||||
1920..1922 ; Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
|
||||
|
@ -232,7 +236,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
1A7F ; Extend # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
|
||||
1AB0..1ABD ; Extend # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
|
||||
1ABE ; Extend # Me COMBINING PARENTHESES OVERLAY
|
||||
1ABF..1AC0 ; Extend # Mn [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
|
||||
1ABF..1ACE ; Extend # Mn [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
|
||||
1B00..1B03 ; Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
|
||||
1B34 ; Extend # Mn BALINESE SIGN REREKAN
|
||||
1B35 ; Extend # Mc BALINESE VOWEL SIGN TEDUNG
|
||||
|
@ -256,8 +260,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
1CED ; Extend # Mn VEDIC SIGN TIRYAK
|
||||
1CF4 ; Extend # Mn VEDIC TONE CANDRA ABOVE
|
||||
1CF8..1CF9 ; Extend # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
1DC0..1DF9 ; Extend # Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
|
||||
1DFB..1DFF ; Extend # Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
1DC0..1DFF ; Extend # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
200C ; Extend # Cf ZERO WIDTH NON-JOINER
|
||||
20D0..20DC ; Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
|
||||
20DD..20E0 ; Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
|
||||
|
@ -322,11 +325,15 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
|
||||
10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
|
||||
10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
|
||||
10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
|
||||
11001 ; Extend # Mn BRAHMI SIGN ANUSVARA
|
||||
11038..11046 ; Extend # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
|
||||
11070 ; Extend # Mn BRAHMI SIGN OLD TAMIL VIRAMA
|
||||
11073..11074 ; Extend # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
|
||||
1107F..11081 ; Extend # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
|
||||
110B3..110B6 ; Extend # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
|
||||
110B9..110BA ; Extend # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
|
||||
110C2 ; Extend # Mn KAITHI VOWEL SIGN VOCALIC R
|
||||
11100..11102 ; Extend # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
|
||||
11127..1112B ; Extend # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
|
||||
1112D..11134 ; Extend # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
|
||||
|
@ -412,6 +419,8 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
16F8F..16F92 ; Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
|
||||
16FE4 ; Extend # Mn KHITAN SMALL SCRIPT FILLER
|
||||
1BC9D..1BC9E ; Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
|
||||
1CF00..1CF2D ; Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
|
||||
1CF30..1CF46 ; Extend # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
|
||||
1D165 ; Extend # Mc MUSICAL SYMBOL COMBINING STEM
|
||||
1D167..1D169 ; Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
|
||||
1D16E..1D172 ; Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
|
||||
|
@ -431,6 +440,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
|
||||
1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
|
||||
1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
|
||||
1E2AE ; Extend # Mn TOTO SIGN RISING TONE
|
||||
1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
|
||||
1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
|
||||
1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
|
||||
|
@ -438,7 +448,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
|
||||
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
||||
|
||||
# Total code points: 1984
|
||||
# Total code points: 2095
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -495,6 +505,8 @@ E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
|||
103B..103C ; SpacingMark # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA
|
||||
1056..1057 ; SpacingMark # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR
|
||||
1084 ; SpacingMark # Mc MYANMAR VOWEL SIGN SHAN E
|
||||
1715 ; SpacingMark # Mc TAGALOG SIGN PAMUDPOD
|
||||
1734 ; SpacingMark # Mc HANUNOO SIGN PAMUDPOD
|
||||
17B6 ; SpacingMark # Mc KHMER VOWEL SIGN AA
|
||||
17BE..17C5 ; SpacingMark # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
|
||||
17C7..17C8 ; SpacingMark # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
|
||||
|
@ -579,7 +591,6 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK
|
|||
116AC ; SpacingMark # Mc TAKRI SIGN VISARGA
|
||||
116AE..116AF ; SpacingMark # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
|
||||
116B6 ; SpacingMark # Mc TAKRI SIGN VIRAMA
|
||||
11720..11721 ; SpacingMark # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA
|
||||
11726 ; SpacingMark # Mc AHOM VOWEL SIGN E
|
||||
1182C..1182E ; SpacingMark # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II
|
||||
11838 ; SpacingMark # Mc DOGRA SIGN VISARGA
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,212 @@
|
|||
# PropertyAliases-14.0.0.txt
|
||||
# Date: 2021-03-08, 19:35:48 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# This file contains aliases for properties used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line has two or more fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field is the short name for the property.
|
||||
# It is typically an abbreviation, but in a number of cases it is simply
|
||||
# a duplicate of the "long name" in the second field.
|
||||
# For Unihan database tags, the short name is actually a longer string than
|
||||
# the tag specified in the second field.
|
||||
#
|
||||
# Second Field: The second field is the long name for the property,
|
||||
# typically the formal name used in documentation about the property.
|
||||
#
|
||||
# The above are the preferred aliases. Other aliases may be listed in additional fields.
|
||||
#
|
||||
# Loose matching should be applied to all property names and property values, with
|
||||
# the exception of String Property values. With loose matching of property names and
|
||||
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
|
||||
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
|
||||
#
|
||||
# NOTE: Property value names are NOT unique across properties. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Above_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# sc means the Script property, and
|
||||
# Sc means the General_Category property value Currency_Symbol (Sc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
#
|
||||
# For more information, see UAX #44, Unicode Character Database, and
|
||||
# UTS #18, Unicode Regular Expressions.
|
||||
# ================================================
|
||||
|
||||
|
||||
# ================================================
|
||||
# Numeric Properties
|
||||
# ================================================
|
||||
cjkAccountingNumeric ; kAccountingNumeric
|
||||
cjkOtherNumeric ; kOtherNumeric
|
||||
cjkPrimaryNumeric ; kPrimaryNumeric
|
||||
nv ; Numeric_Value
|
||||
|
||||
# ================================================
|
||||
# String Properties
|
||||
# ================================================
|
||||
cf ; Case_Folding
|
||||
cjkCompatibilityVariant ; kCompatibilityVariant
|
||||
dm ; Decomposition_Mapping
|
||||
FC_NFKC ; FC_NFKC_Closure
|
||||
lc ; Lowercase_Mapping
|
||||
NFKC_CF ; NFKC_Casefold
|
||||
scf ; Simple_Case_Folding ; sfc
|
||||
slc ; Simple_Lowercase_Mapping
|
||||
stc ; Simple_Titlecase_Mapping
|
||||
suc ; Simple_Uppercase_Mapping
|
||||
tc ; Titlecase_Mapping
|
||||
uc ; Uppercase_Mapping
|
||||
|
||||
# ================================================
|
||||
# Miscellaneous Properties
|
||||
# ================================================
|
||||
bmg ; Bidi_Mirroring_Glyph
|
||||
bpb ; Bidi_Paired_Bracket
|
||||
cjkIICore ; kIICore
|
||||
cjkIRG_GSource ; kIRG_GSource
|
||||
cjkIRG_HSource ; kIRG_HSource
|
||||
cjkIRG_JSource ; kIRG_JSource
|
||||
cjkIRG_KPSource ; kIRG_KPSource
|
||||
cjkIRG_KSource ; kIRG_KSource
|
||||
cjkIRG_MSource ; kIRG_MSource
|
||||
cjkIRG_SSource ; kIRG_SSource
|
||||
cjkIRG_TSource ; kIRG_TSource
|
||||
cjkIRG_UKSource ; kIRG_UKSource
|
||||
cjkIRG_USource ; kIRG_USource
|
||||
cjkIRG_VSource ; kIRG_VSource
|
||||
cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS
|
||||
EqUIdeo ; Equivalent_Unified_Ideograph
|
||||
isc ; ISO_Comment
|
||||
JSN ; Jamo_Short_Name
|
||||
na ; Name
|
||||
na1 ; Unicode_1_Name
|
||||
Name_Alias ; Name_Alias
|
||||
scx ; Script_Extensions
|
||||
|
||||
# ================================================
|
||||
# Catalog Properties
|
||||
# ================================================
|
||||
age ; Age
|
||||
blk ; Block
|
||||
sc ; Script
|
||||
|
||||
# ================================================
|
||||
# Enumerated Properties
|
||||
# ================================================
|
||||
bc ; Bidi_Class
|
||||
bpt ; Bidi_Paired_Bracket_Type
|
||||
ccc ; Canonical_Combining_Class
|
||||
dt ; Decomposition_Type
|
||||
ea ; East_Asian_Width
|
||||
gc ; General_Category
|
||||
GCB ; Grapheme_Cluster_Break
|
||||
hst ; Hangul_Syllable_Type
|
||||
InPC ; Indic_Positional_Category
|
||||
InSC ; Indic_Syllabic_Category
|
||||
jg ; Joining_Group
|
||||
jt ; Joining_Type
|
||||
lb ; Line_Break
|
||||
NFC_QC ; NFC_Quick_Check
|
||||
NFD_QC ; NFD_Quick_Check
|
||||
NFKC_QC ; NFKC_Quick_Check
|
||||
NFKD_QC ; NFKD_Quick_Check
|
||||
nt ; Numeric_Type
|
||||
SB ; Sentence_Break
|
||||
vo ; Vertical_Orientation
|
||||
WB ; Word_Break
|
||||
|
||||
# ================================================
|
||||
# Binary Properties
|
||||
# ================================================
|
||||
AHex ; ASCII_Hex_Digit
|
||||
Alpha ; Alphabetic
|
||||
Bidi_C ; Bidi_Control
|
||||
Bidi_M ; Bidi_Mirrored
|
||||
Cased ; Cased
|
||||
CE ; Composition_Exclusion
|
||||
CI ; Case_Ignorable
|
||||
Comp_Ex ; Full_Composition_Exclusion
|
||||
CWCF ; Changes_When_Casefolded
|
||||
CWCM ; Changes_When_Casemapped
|
||||
CWKCF ; Changes_When_NFKC_Casefolded
|
||||
CWL ; Changes_When_Lowercased
|
||||
CWT ; Changes_When_Titlecased
|
||||
CWU ; Changes_When_Uppercased
|
||||
Dash ; Dash
|
||||
Dep ; Deprecated
|
||||
DI ; Default_Ignorable_Code_Point
|
||||
Dia ; Diacritic
|
||||
EBase ; Emoji_Modifier_Base
|
||||
EComp ; Emoji_Component
|
||||
EMod ; Emoji_Modifier
|
||||
Emoji ; Emoji
|
||||
EPres ; Emoji_Presentation
|
||||
Ext ; Extender
|
||||
ExtPict ; Extended_Pictographic
|
||||
Gr_Base ; Grapheme_Base
|
||||
Gr_Ext ; Grapheme_Extend
|
||||
Gr_Link ; Grapheme_Link
|
||||
Hex ; Hex_Digit
|
||||
Hyphen ; Hyphen
|
||||
IDC ; ID_Continue
|
||||
Ideo ; Ideographic
|
||||
IDS ; ID_Start
|
||||
IDSB ; IDS_Binary_Operator
|
||||
IDST ; IDS_Trinary_Operator
|
||||
Join_C ; Join_Control
|
||||
LOE ; Logical_Order_Exception
|
||||
Lower ; Lowercase
|
||||
Math ; Math
|
||||
NChar ; Noncharacter_Code_Point
|
||||
OAlpha ; Other_Alphabetic
|
||||
ODI ; Other_Default_Ignorable_Code_Point
|
||||
OGr_Ext ; Other_Grapheme_Extend
|
||||
OIDC ; Other_ID_Continue
|
||||
OIDS ; Other_ID_Start
|
||||
OLower ; Other_Lowercase
|
||||
OMath ; Other_Math
|
||||
OUpper ; Other_Uppercase
|
||||
Pat_Syn ; Pattern_Syntax
|
||||
Pat_WS ; Pattern_White_Space
|
||||
PCM ; Prepended_Concatenation_Mark
|
||||
QMark ; Quotation_Mark
|
||||
Radical ; Radical
|
||||
RI ; Regional_Indicator
|
||||
SD ; Soft_Dotted
|
||||
STerm ; Sentence_Terminal
|
||||
Term ; Terminal_Punctuation
|
||||
UIdeo ; Unified_Ideograph
|
||||
Upper ; Uppercase
|
||||
VS ; Variation_Selector
|
||||
WSpace ; White_Space ; space
|
||||
XIDC ; XID_Continue
|
||||
XIDS ; XID_Start
|
||||
XO_NFC ; Expands_On_NFC
|
||||
XO_NFD ; Expands_On_NFD
|
||||
XO_NFKC ; Expands_On_NFKC
|
||||
XO_NFKD ; Expands_On_NFKD
|
||||
|
||||
# ================================================
|
||||
# Total: 129
|
||||
|
||||
# EOF
|
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,6 @@
|
|||
# ScriptExtensions-13.0.0.txt
|
||||
# Date: 2020-01-22, 00:07:43 GMT
|
||||
# © 2020 Unicode®, Inc.
|
||||
# ScriptExtensions-14.0.0.txt
|
||||
# Date: 2021-06-04, 02:19:38 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -11,10 +11,10 @@
|
|||
# with more than one script, but with a limited number of scripts.
|
||||
# For each code point, there is one or more property values. Each such value is a Script property value.
|
||||
# For more information, see:
|
||||
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
|
||||
# UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
|
||||
# Especially the sections:
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
# Each Script_Extensions value in this file consists of a set
|
||||
# of one or more abbreviated Script property values. The ordering of the
|
||||
|
@ -119,6 +119,14 @@
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Syrc
|
||||
|
||||
1DFA ; Syrc # Mn COMBINING DOT BELOW LEFT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Copt
|
||||
|
||||
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
|
||||
|
@ -136,6 +144,15 @@
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Nkoo
|
||||
|
||||
FD3E ; Arab Nkoo # Pe ORNATE LEFT PARENTHESIS
|
||||
FD3F ; Arab Nkoo # Ps ORNATE RIGHT PARENTHESIS
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Syrc
|
||||
|
||||
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
|
||||
|
@ -186,10 +203,10 @@ A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
|
|||
|
||||
# Script_Extensions=Cprt Linb
|
||||
|
||||
10100..10102 ; Cprt Linb # Po [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
|
||||
10102 ; Cprt Linb # Po AEGEAN CHECK MARK
|
||||
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
|
||||
|
||||
# Total code points: 12
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -342,6 +359,14 @@ FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFW
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mani Ougr
|
||||
|
||||
10AF2 ; Mani Ougr # Po MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mong Phag
|
||||
|
||||
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
|
||||
|
@ -383,6 +408,14 @@ FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFW
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cpmn Cprt Linb
|
||||
|
||||
10100..10101 ; Cpmn Cprt Linb # Po [2] AEGEAN WORD SEPARATOR LINE..AEGEAN WORD SEPARATOR DOT
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cprt Lina Linb
|
||||
|
||||
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
|
||||
|
@ -449,16 +482,6 @@ A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Rohg Syrc Thaa Yezi
|
||||
|
||||
060C ; Arab Rohg Syrc Thaa Yezi # Po ARABIC COMMA
|
||||
061B ; Arab Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
|
||||
061F ; Arab Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana
|
||||
|
||||
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
|
||||
|
@ -474,6 +497,15 @@ FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Nkoo Rohg Syrc Thaa Yezi
|
||||
|
||||
060C ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC COMMA
|
||||
061B ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
|
||||
|
||||
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
|
||||
|
@ -513,9 +545,9 @@ FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC C
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
|
||||
# Script_Extensions=Adlm Arab Nkoo Rohg Syrc Thaa Yezi
|
||||
|
||||
0640 ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
|
||||
061F ; Adlm Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
|
@ -529,6 +561,14 @@ FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC C
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc
|
||||
|
||||
0640 ; Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
|
||||
|
||||
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue