Compare commits
142 Commits
pcre2-10.3
...
amigaos
Author | SHA1 | Date |
---|---|---|
George Sokianos | 4a45482c9c | |
Philip Hazel | 8b133fa0ba | |
Philip Hazel | cc5e121c8e | |
Philip Hazel | 1343bdff8f | |
Philip Hazel | d90fb23878 | |
Ezekiel Warren | e47fc51584 | |
Zoltan Herczeg | b67d568201 | |
Zoltan Herczeg | 4851890ede | |
Amin Yahyaabadi | 3e52db5209 | |
Philip Hazel | 4804b00e8f | |
Philip Hazel | 7549fdca74 | |
Philip Hazel | 5271b533c4 | |
larinsv | 45af1203bd | |
Rémi Verschelde | 187b7ba050 | |
William A Rowe Jr | 06f34ba374 | |
GregThain | a334ea2a34 | |
Carlo Marcelo Arenas Belón | 15a82c3efd | |
Philip Hazel | 51a5fcdc1f | |
Philip Hazel | 104fe2fead | |
Philip Hazel | f65df06305 | |
pkeir | a13d7d4340 | |
Lucas Trzesniewski | c630e868ca | |
Joe Zhang | 77ce1ff528 | |
Philip Hazel | ff5402a378 | |
Philip Hazel | b52d055d1b | |
Carlo Marcelo Arenas Belón | a4ac97fea8 | |
Philip Hazel | fedf4d9d40 | |
Philip Hazel | 8ebf9efe7b | |
Carlo Marcelo Arenas Belón | 4edcf6ada5 | |
Philip Hazel | d0c7544e78 | |
Carlo Marcelo Arenas Belón | f28e82602d | |
Philip Hazel | 1bb2b97b29 | |
Lucas Trzesniewski | 3fec24a26f | |
Philip Hazel | 66b3cb34df | |
Philip Hazel | 29a43aa11d | |
Philip Hazel | 3103b8f20a | |
Philip Hazel | 13be26a5c2 | |
pagabuc | ba6a5f16d2 | |
Zoltan Herczeg | d07c967b3a | |
Carlo Marcelo Arenas Belón | 4279abbd7d | |
Philip Hazel | 8ff3ab27d5 | |
Zoltan Herczeg | e612e06b5d | |
Philip Hazel | 64c9baaaa4 | |
Carlo Marcelo Arenas Belón | 9c8abddc52 | |
Carlo Marcelo Arenas Belón | f11c26842d | |
Zoltan Herczeg | 4ca0530b9b | |
Zoltan Herczeg | 03654e751e | |
Zoltan Herczeg | d4fa336fbc | |
Zoltan Herczeg | 50a51cb7e6 | |
Philip Hazel | f7a7341726 | |
Philip Hazel | eef5740ff9 | |
Zoltan Herczeg | dea56d2df9 | |
Adam | 111cd470b5 | |
Philip Hazel | fdd9479108 | |
Philip Hazel | 419e3c68a3 | |
Zoltan Herczeg | e21345de97 | |
Philip Hazel | e85a81ebac | |
Philip Hazel | 504ff06fff | |
Philip Hazel | 360a84e80b | |
Zoltan Herczeg | 061e57695a | |
Philip Hazel | 7f7d3e8521 | |
Philip Hazel | bf35c0518c | |
Zoltan Herczeg | 68fbc1982e | |
Philip Hazel | 06d3a66065 | |
Philip Hazel | 87571b5af3 | |
Philip Hazel | 838cdac4dc | |
Philip Hazel | 628a804102 | |
Philip Hazel | ec091e2e44 | |
Philip Hazel | 636569a957 | |
Philip Hazel | 81d3729c66 | |
Zoltan Herczeg | f90542a209 | |
Carlo Marcelo Arenas Belón | 14dbc6e6ec | |
Philip Hazel | 80205ee2a0 | |
Jessica Clarke | 04ecb267c0 | |
Jessica Clarke | 534b4760e3 | |
Philip Hazel | 31fb2e58a1 | |
Zoltan Herczeg | 435140a0ac | |
Philip Hazel | c24047f15d | |
Zoltan Herczeg | e7457003cd | |
Philip Hazel | d888d36013 | |
Zoltan Herczeg | 6614b281bc | |
Zoltan Herczeg | afa4756d19 | |
Philip Hazel | 7713f33e46 | |
Michael Kaufmann | af2637ee5e | |
Philip Hazel | 98e7d70bc6 | |
Philip Hazel | 321b559ed4 | |
Philip Hazel | 16c8a84cce | |
Philip Hazel | 4514ddd2a2 | |
Philip Hazel | 944f0e10a1 | |
Philip Hazel | b29732063b | |
Philip Hazel | 92d7cf1dd0 | |
Philip Hazel | 1d432ee3cf | |
Philip Hazel | 194a15315a | |
Philip Hazel | 1c41a5b815 | |
Zoltan Herczeg | 4243515033 | |
Philip Hazel | 49b29f837d | |
Philip Hazel | 30abd0ac8d | |
Philip Hazel | 0246c6bf64 | |
Philip Hazel | 823d4ac956 | |
Philip Hazel | ba3d0edcbd | |
Philip Hazel | 4ef0c51d2b | |
Philip Hazel | 7ab2769728 | |
Philip Hazel | 2a294ddadb | |
Philip Hazel | cb854a912e | |
Philip Hazel | 16dccbcb13 | |
Carlo Marcelo Arenas Belón | ae4e6261e5 | |
Carlo Marcelo Arenas Belón | d24a1c9d31 | |
Carlo Marcelo Arenas Belón | 055b7ce4a9 | |
Philip Hazel | 4a8f5d104c | |
Carlo Marcelo Arenas Belón | 587b94277b | |
Philip Hazel | c8d31f1605 | |
Carlo Marcelo Arenas Belón | adf76faace | |
Zoltan Herczeg | d144199dfb | |
Carlo Marcelo Arenas Belón | eb42305f07 | |
Philip Hazel | 46890604a4 | |
Carlo Marcelo Arenas Belón | acc520924c | |
Philip Hazel | bc70a183fc | |
Carlo Marcelo Arenas Belón | dae475092d | |
Philip Hazel | 1ed34b9cb1 | |
Philip Hazel | f19e84674e | |
Carlo Marcelo Arenas Belón | 7db8784296 | |
Philip Hazel | 072717a61f | |
Philip Hazel | 35fee4193b | |
Philip Hazel | 3469b13b8e | |
Philip Hazel | 29c37f9aa3 | |
Carlo Marcelo Arenas Belón | 128c50360c | |
Philip Hazel | bf2c8cc564 | |
Philip Hazel | 87f32b9b39 | |
Philip Hazel | 7ed39af7cc | |
Carlo Marcelo Arenas Belón | 3b973ebf4b | |
Carlo Marcelo Arenas Belón | f5e4e10042 | |
Carlo Marcelo Arenas Belón | d46f1863be | |
Philip Hazel | c99f0738c5 | |
Philip Hazel | 794470b51d | |
PhilipHazel | 179c5d212c | |
Lucas Trzesniewski | ec0755b829 | |
Philip Hazel | 8d9e91228c | |
PhilipHazel | e7af7efaa1 | |
Zoltan Herczeg | 51ec2c9893 | |
Philip Hazel | 0612ed77c2 | |
Philip Hazel | 507e4dcf6f | |
Zoltan Herczeg | dc5f966635 |
|
@ -0,0 +1,3 @@
|
|||
common --experimental_enable_bzlmod
|
||||
build --incompatible_enable_cc_toolchain_resolution
|
||||
build --incompatible_strict_action_env
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
name: Build
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Linux
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
alpine:
|
||||
name: alpine
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autotools
|
||||
run: apk add --no-cache automake autoconf gcc libtool make musl-dev
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
windows:
|
||||
name: 32bit Windows
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Configure
|
||||
run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
cd build\Debug
|
||||
..\..\RunTest.bat
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master ]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: [ master ]
|
||||
schedule:
|
||||
- cron: '27 6 * * 4'
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'cpp', 'python' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v1
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v1
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
||||
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
|
||||
# and modify them (or add more) to build your code if your project
|
||||
# uses a compiled language
|
||||
|
||||
#- run: |
|
||||
# make bootstrap
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v1
|
|
@ -0,0 +1,55 @@
|
|||
name: Scorecards supply-chain security
|
||||
on:
|
||||
# Only the default branch is supported.
|
||||
branch_protection_rule:
|
||||
schedule:
|
||||
- cron: '23 17 * * 1'
|
||||
push:
|
||||
branches: [ master ]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecards analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
actions: read
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# Read-only PAT token. To create it,
|
||||
# follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
|
||||
repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
|
||||
# Publish the results to enable scorecard badges. For more details, see
|
||||
# https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories, `publish_results` will automatically be set to `false`,
|
||||
# regardless of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional).
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -6,7 +6,9 @@
|
|||
*.pc
|
||||
*.o
|
||||
*~
|
||||
*.lha
|
||||
|
||||
__pycache__
|
||||
.deps
|
||||
.libs
|
||||
|
||||
|
@ -74,4 +76,7 @@ src/pcre2.h
|
|||
src/pcre2_chartables.c
|
||||
src/stamp-h1
|
||||
|
||||
/bazel-*
|
||||
|
||||
# End
|
||||
|
||||
|
|
6
AUTHORS
6
AUTHORS
|
@ -8,7 +8,7 @@ Email domain: gmail.com
|
|||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2021 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
|
@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2021 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2021 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
####
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
|
||||
load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
|
||||
|
||||
copy_file(
|
||||
name = "config_h_generic",
|
||||
src = "src/config.h.generic",
|
||||
out = "src/config.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_h_generic",
|
||||
src = "src/pcre2.h.generic",
|
||||
out = "src/pcre2.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_chartables_c",
|
||||
src = "src/pcre2_chartables.c.dist",
|
||||
out = "src/pcre2_chartables.c",
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "pcre2",
|
||||
srcs = [
|
||||
"src/pcre2_auto_possess.c",
|
||||
"src/pcre2_compile.c",
|
||||
"src/pcre2_config.c",
|
||||
"src/pcre2_context.c",
|
||||
"src/pcre2_convert.c",
|
||||
"src/pcre2_dfa_match.c",
|
||||
"src/pcre2_error.c",
|
||||
"src/pcre2_extuni.c",
|
||||
"src/pcre2_find_bracket.c",
|
||||
"src/pcre2_maketables.c",
|
||||
"src/pcre2_match.c",
|
||||
"src/pcre2_match_data.c",
|
||||
"src/pcre2_newline.c",
|
||||
"src/pcre2_ord2utf.c",
|
||||
"src/pcre2_pattern_info.c",
|
||||
"src/pcre2_script_run.c",
|
||||
"src/pcre2_serialize.c",
|
||||
"src/pcre2_string_utils.c",
|
||||
"src/pcre2_study.c",
|
||||
"src/pcre2_substitute.c",
|
||||
"src/pcre2_substring.c",
|
||||
"src/pcre2_tables.c",
|
||||
"src/pcre2_ucd.c",
|
||||
"src/pcre2_ucptables.c",
|
||||
"src/pcre2_valid_utf.c",
|
||||
"src/pcre2_xclass.c",
|
||||
":pcre2_chartables_c",
|
||||
],
|
||||
hdrs = glob(["src/*.h"]) + [
|
||||
":config_h_generic",
|
||||
":pcre2_h_generic",
|
||||
],
|
||||
defines = [
|
||||
"HAVE_CONFIG_H",
|
||||
"PCRE2_CODE_UNIT_WIDTH=8",
|
||||
"PCRE2_STATIC",
|
||||
],
|
||||
includes = ["src"],
|
||||
strip_include_prefix = "src",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "pcre2demo",
|
||||
srcs = ["src/pcre2demo.c"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":pcre2"],
|
||||
)
|
100
CMakeLists.txt
100
CMakeLists.txt
|
@ -103,13 +103,18 @@
|
|||
PROJECT(PCRE2 C)
|
||||
|
||||
# Increased minimum to 2.8.5 to support GNUInstallDirs.
|
||||
# Increased minimum to 3.0.0 because older than 2.8.12 is deprecated.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
|
||||
# Increased minimum to 3.1 to support imported targets.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.1)
|
||||
|
||||
# Set policy CMP0026 to avoid warnings for the use of LOCATION in
|
||||
# GET_TARGET_PROPERTY. This should no longer be required.
|
||||
# CMAKE_POLICY(SET CMP0026 OLD)
|
||||
|
||||
# With a recent cmake, you can provide a rootdir to look for non
|
||||
# standard installed library dependencies, but to do so, the policy
|
||||
# needs to be set to new (by uncommenting the following)
|
||||
# CMAKE_POLICY(SET CMP0074 NEW)
|
||||
|
||||
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
|
||||
# on the command line.
|
||||
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
@ -134,8 +139,6 @@ INCLUDE(CheckTypeSize)
|
|||
INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR
|
||||
|
||||
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
|
||||
CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H)
|
||||
CHECK_INCLUDE_FILE(inttypes.h HAVE_INTTYPES_H)
|
||||
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
|
||||
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
|
||||
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
|
||||
|
@ -144,10 +147,16 @@ CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
|
|||
CHECK_SYMBOL_EXISTS(bcopy "strings.h" HAVE_BCOPY)
|
||||
CHECK_SYMBOL_EXISTS(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE)
|
||||
CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE)
|
||||
CHECK_SYMBOL_EXISTS(realpath "stdlib.h" HAVE_REALPATH)
|
||||
CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV)
|
||||
CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
|
||||
HAVE_REALPATH
|
||||
)
|
||||
|
||||
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
|
@ -302,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
|
|||
IF(EDITLINE_FOUND)
|
||||
OPTION (PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
IF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ELSE(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
|
||||
" or set Editline_ROOT to a full libedit installed tree, as needed\n"
|
||||
" Might need to enable policy CMP0074 in CMakeLists.txt"
|
||||
)
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
|
||||
# readline lib
|
||||
IF(READLINE_FOUND)
|
||||
|
@ -342,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
|||
ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
|
||||
IF(READLINE_FOUND)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" Only one of the readline compatible libraries can be enabled.\n"
|
||||
" Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
|
||||
)
|
||||
ENDIF(READLINE_FOUND)
|
||||
ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
@ -358,7 +382,13 @@ IF(PCRE2_SUPPORT_UNICODE)
|
|||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT)
|
||||
SET(SUPPORT_JIT 1)
|
||||
SET(SUPPORT_JIT 1)
|
||||
IF(UNIX)
|
||||
FIND_PACKAGE(Threads REQUIRED)
|
||||
IF(CMAKE_USE_PTHREADS_INIT)
|
||||
SET(REQUIRE_PTHREAD 1)
|
||||
ENDIF(CMAKE_USE_PTHREADS_INIT)
|
||||
ENDIF(UNIX)
|
||||
ENDIF(PCRE2_SUPPORT_JIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
|
@ -628,6 +658,8 @@ IF(MINGW AND BUILD_SHARED_LIBS)
|
|||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC AND BUILD_SHARED_LIBS)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
@ -673,6 +705,10 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
|
||||
|
@ -683,6 +719,7 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET(targets ${targets} pcre2-posix-static)
|
||||
|
||||
IF(MSVC)
|
||||
|
@ -699,6 +736,7 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -706,8 +744,12 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION}
|
||||
OUTPUT_NAME pcre2-8)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -717,6 +759,8 @@ IF(PCRE2_BUILD_PCRE2_8)
|
|||
OUTPUT_NAME pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
|
||||
SET(targets ${targets} pcre2-posix-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
|
@ -742,6 +786,7 @@ ENDIF(PCRE2_BUILD_PCRE2_8)
|
|||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -749,6 +794,9 @@ IF(PCRE2_BUILD_PCRE2_16)
|
|||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-static)
|
||||
|
||||
IF(MSVC)
|
||||
|
@ -763,6 +811,7 @@ IF(PCRE2_BUILD_PCRE2_16)
|
|||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -770,7 +819,12 @@ IF(PCRE2_BUILD_PCRE2_16)
|
|||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION}
|
||||
OUTPUT_NAME pcre2-16)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
|
@ -794,6 +848,7 @@ ENDIF(PCRE2_BUILD_PCRE2_16)
|
|||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -801,6 +856,9 @@ IF(PCRE2_BUILD_PCRE2_32)
|
|||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-static)
|
||||
|
||||
IF(MSVC)
|
||||
|
@ -815,6 +873,7 @@ IF(PCRE2_BUILD_PCRE2_32)
|
|||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
|
@ -822,7 +881,12 @@ IF(PCRE2_BUILD_PCRE2_32)
|
|||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION}
|
||||
OUTPUT_NAME pcre2-32)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
|
@ -1024,25 +1088,13 @@ FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
|
|||
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
|
||||
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
|
||||
|
||||
FOREACH(man ${man3})
|
||||
GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
|
||||
SET(man3_new ${man3} ${man})
|
||||
ENDFOREACH(man ${man3})
|
||||
SET(man3 ${man3_new})
|
||||
|
||||
INSTALL(FILES ${man1} DESTINATION man/man1)
|
||||
INSTALL(FILES ${man3} DESTINATION man/man3)
|
||||
INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)
|
||||
|
||||
IF(MSVC AND INSTALL_MSVC_PDB)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posix.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posixd.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS Debug)
|
||||
INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
|
||||
ENDIF(MSVC AND INSTALL_MSVC_PDB)
|
||||
|
||||
# Help, only for nice output
|
||||
|
|
240
ChangeLog
240
ChangeLog
|
@ -1,11 +1,229 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
Change Log for PCRE2 - see also the Git log
|
||||
-------------------------------------------
|
||||
|
||||
Version 10.38-RC1 31-August-2021
|
||||
--------------------------------
|
||||
|
||||
Version 10.41 xx-xxx-2022
|
||||
-------------------------
|
||||
|
||||
1. Add fflush() before and after a fork callout in pcre2grep to get its output
|
||||
to be the same on all systems. (There were previously ordering differences in
|
||||
Alpine Linux).
|
||||
|
||||
2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
|
||||
|
||||
3. SSF scorecards grumbled about possible overflow in an expression in
|
||||
pcre2test. It never would have overflowed in practice, but some casts have been
|
||||
added and at the same time there's been some tidying of fprints that output
|
||||
size_t values.
|
||||
|
||||
4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
|
||||
|
||||
5. Minor code re-arrangement to remove gcc warning about realloc() in
|
||||
pcre2test.
|
||||
|
||||
6. Change a number of int variables that hold buffer and line lengths in
|
||||
pcre2grep to PCRE2_SIZE (aka size_t).
|
||||
|
||||
7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
|
||||
supported (even though that function would do nothing in that case) at the
|
||||
request of a user who doesn't even want to link with pcre2_jit_compile.o. Also
|
||||
tidied up an untidy #ifdef arrangement in pcre2test.
|
||||
|
||||
8. Fixed an issue in the backtracking optimization of character repeats in
|
||||
JIT. Furthermore optimize star repetitions, not just plus repetitions.
|
||||
|
||||
9. Removed the use of an initial backtracking frames vector on the system stack
|
||||
in pcre2_match() so that it now always uses the heap. (In a multi-thread
|
||||
environment with very small stacks there had been an issue.) This also is
|
||||
tidier for JIT matching, which didn't need that vector. The heap vector is now
|
||||
remembered in the match data block and re-used if that block itself is re-used.
|
||||
It is freed with the match data block.
|
||||
|
||||
10. Adjusted the find_limits code in pcre2test to work with change 9 above.
|
||||
|
||||
11. Added find_limits_noheap to pcre2test, because the heap limits are now
|
||||
different in different environments and so cannot be included in the standard
|
||||
tests.
|
||||
|
||||
12. Created a test for pcre2_match() heap processing that is not part of the
|
||||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
|
||||
handling of multiple passes.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
|
||||
in pcre2grep with buffered fseek(stdin).
|
||||
|
||||
3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
|
||||
not supported.
|
||||
|
||||
4. Revert an unintended change in JIT repeat detection.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
|
||||
|
||||
6. Merged documentation and comments patches from @carenas (GitHub #47).
|
||||
|
||||
7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
|
||||
from pcre2grep.
|
||||
|
||||
8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
|
||||
|
||||
9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
|
||||
substituting.
|
||||
|
||||
10. Add null_subject and null_replacement modifiers to pcre2test.
|
||||
|
||||
11. Add check for NULL subject to POSIX regexec() function.
|
||||
|
||||
12. Add check for NULL replacement to pcre2_substitute().
|
||||
|
||||
13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
|
||||
pcre2_substitute(), and the replacement argument of the latter, if the pointer
|
||||
is NULL and the length is zero, treat as an empty string. Apparently a number
|
||||
of applications treat NULL/0 in this way.
|
||||
|
||||
14. Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
15. Fix some minor issues raised by clang sanitize.
|
||||
|
||||
16. Very minor code speed up for maximizing character property matches.
|
||||
|
||||
17. A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
18. The Python scripts in the maint directory have been refactored. There are
|
||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||
duplicated are now held in a single common Python module.
|
||||
|
||||
19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
|
||||
hardware capabilities, which consist of both an integer address and additional
|
||||
metadata, meaning they are twice the size of the platform's size_t type, i.e.
|
||||
16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
|
||||
8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
|
||||
not 16. Whilst the first frame was always suitably aligned, this then
|
||||
misaligned the frame that follows, resulting in an alignment fault when storing
|
||||
a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
|
||||
Clarke PR#72.
|
||||
|
||||
20. Added -LP and -LS listing options to pcre2test.
|
||||
|
||||
21. A user discovered that the library names in CMakeLists.txt for MSVC
|
||||
debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
|
||||
|
||||
22. An item such as [Aa] is optimized into a caseless single character match.
|
||||
When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
|
||||
pattern, the optimizing "must be present for a match" character check was not
|
||||
being flagged as caseless, causing some matches that should have succeeded to
|
||||
fail.
|
||||
|
||||
23. Fixed a Unicode property matching issue in JIT. The character was not
|
||||
fully read in caseless matching.
|
||||
|
||||
24. Fixed an issue affecting recursions in JIT caused by duplicated data
|
||||
transfers.
|
||||
|
||||
25. Merged patch from @carenas (GitHub #96) which fixes some problems with
|
||||
pcre2test and readline/readedit:
|
||||
|
||||
* Use the right header for libedit in FreeBSD with autoconf
|
||||
* Really allow libedit with cmake
|
||||
* Avoid using readline headers with libedit
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #28):
|
||||
|
||||
Visual Studio 2013 includes support for %zu and %td, so let newer
|
||||
versions of it avoid the fallback, and while at it, make sure that
|
||||
the first check is for DISABLE_PERCENT_ZT so it will be always
|
||||
honoured if chosen.
|
||||
|
||||
ptrdiff_t is signed, so use a signed type instead, and make sure
|
||||
that an appropriate width is chosen if pointers are 64bit wide and
|
||||
long is not (ex: Windows 64bit).
|
||||
|
||||
IMHO removing the cast (and therefore the possibility of truncation)
|
||||
makes the code cleaner and the fallback is likely portable enough
|
||||
with all 64-bit POSIX systems doing LP64 except for Windows.
|
||||
|
||||
3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0.
|
||||
|
||||
4. Merged patch from @carenas (GitHub #30):
|
||||
|
||||
* Cleanup: remove references to no longer used stdint.h
|
||||
|
||||
Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h
|
||||
(simplification) and remove the now unnecessary inclusion in
|
||||
pcre2_internal.h., 2018-11-14), stdint.h is no longer used.
|
||||
|
||||
Remove checks for it in autotools and CMake and document better the expected
|
||||
build failures for systems that might have stdint.h (C99) and not inttypes.h
|
||||
(from POSIX), like old Windows.
|
||||
|
||||
* Cleanup: remove detection for inttypes.h which is a hard dependency
|
||||
|
||||
CMake checks for standard headers are not meant to be used for hard
|
||||
dependencies, so they will prevent a possible fallback from working.
|
||||
|
||||
Alternatively, the header could be checked to make the configuration fail
|
||||
instead of breaking the build, but that was punted, as it was missing anyway
|
||||
from autotools.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #32):
|
||||
|
||||
* jit: allow building with ancient MSVC versions
|
||||
|
||||
Visual Studio older than 2013 fails to build with JIT enabled, because it is
|
||||
unable to parse non C89 compatible syntax, with mixed declarations and code.
|
||||
While most recent compilers wouldn't even report this as a warning since it
|
||||
is valid C99, it could be also made visible by adding to gcc/clang the
|
||||
-Wdeclaration-after-statement flag at build time.
|
||||
|
||||
Move the code below the affected definitions.
|
||||
|
||||
* pcre2grep: avoid mixing declarations with code
|
||||
|
||||
Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep,
|
||||
2021-08-28), code will fail to build in a strict C89 compiler.
|
||||
|
||||
Reformat slightly to make it C89 compatible again.
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix invalid single character repetition issues in JIT when the repetition
|
||||
is inside a capturing bracket and the bracket is preceeded by character
|
||||
is inside a capturing bracket and the bracket is preceded by character
|
||||
literals.
|
||||
|
||||
2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
|
||||
|
@ -52,6 +270,14 @@ However, just in case anybody was relying on the old behaviour, there is an
|
|||
option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour.
|
||||
An option has also been added to pcre2grep to enable this.
|
||||
|
||||
7. Re-enable a JIT optimization which was unintentionally disabled in 10.35.
|
||||
|
||||
8. There is a loop counter to catch excessively crazy patterns when checking
|
||||
the lengths of lookbehinds at compile time. This was incorrectly getting reset
|
||||
whenever a lookahead was processed, leading to some fuzzer-generated patterns
|
||||
taking a very long time to compile when (?|) was present in the pattern,
|
||||
because (?|) disables caching of group lengths.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
-------------------------
|
||||
|
@ -237,7 +463,7 @@ now correctly backtracked, so this unnecessary restriction has been removed.
|
|||
|
||||
7. Added PCRE2_SUBSTITUTE_MATCHED.
|
||||
|
||||
8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another
|
||||
8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
|
||||
regex engine. The Perl regex folks are aware of this usage and have made a note
|
||||
about it.
|
||||
|
||||
|
@ -668,7 +894,7 @@ Patch by Guillem Jover.
|
|||
warnings were reported.
|
||||
|
||||
38. Using the clang compiler with sanitizing options causes runtime complaints
|
||||
about truncation for statments such as x = ~x when x is an 8-bit value; it
|
||||
about truncation for statements such as x = ~x when x is an 8-bit value; it
|
||||
seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
|
||||
gets rid of the warnings. There were also two missing casts in pcre2test.
|
||||
|
||||
|
|
64
HACKING
64
HACKING
|
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
|
|||
the pcre2test documentation and the comment at the head of the RunTest file.
|
||||
|
||||
PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
|
||||
releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
|
||||
confusion with PCRE1.
|
||||
releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
|
||||
releases started at 10.00 to avoid confusion with PCRE1.
|
||||
|
||||
|
||||
Historical note 1
|
||||
|
@ -38,8 +38,8 @@ Historical note 2
|
|||
By contrast, the code originally written by Henry Spencer (which was
|
||||
subsequently heavily modified for Perl) compiles the expression twice: once in
|
||||
a dummy mode in order to find out how much store will be needed, and then for
|
||||
real. (The Perl version probably doesn't do this any more; I'm talking about
|
||||
the original library.) The execution function operates by backtracking and
|
||||
real. (The Perl version may or may not still do this; I'm talking about the
|
||||
original library.) The execution function operates by backtracking and
|
||||
maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
|
||||
matches individual wild portions of the pattern. This is an "NFA algorithm" in
|
||||
Friedl's terminology.
|
||||
|
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
|
|||
advance to check for such values. When auto-callouts are enabled, the generous
|
||||
assumption is made that there will be a callout for each pattern code unit
|
||||
(which of course is only actually true if all code units are literals) plus one
|
||||
at the end. There is a default parsed pattern vector on the system stack, but
|
||||
if this is not big enough, heap memory is used.
|
||||
at the end. A default parsed pattern vector is defined on the system stack, to
|
||||
minimize memory handling, but if this is not big enough, heap memory is used.
|
||||
|
||||
As before, the actual compiling function is run twice, the first time to
|
||||
determine the amount of memory needed for the final compiled pattern. It
|
||||
|
@ -187,7 +187,7 @@ META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
|
|||
META_CLASS_EMPTY_NOT [^] negative empty class - ditto
|
||||
META_CLASS_END ] end of non-empty class
|
||||
META_CLASS_NOT [^ start non-empty negative class
|
||||
META_COMMIT (*COMMIT)
|
||||
META_COMMIT (*COMMIT) - no argument (see below for with argument)
|
||||
META_COND_ASSERT (?(?assertion)
|
||||
META_DOLLAR $ metacharacter
|
||||
META_DOT . metacharacter
|
||||
|
@ -201,18 +201,18 @@ META_NOCAPTURE (?: no capture parens
|
|||
META_PLUS +
|
||||
META_PLUS_PLUS ++
|
||||
META_PLUS_QUERY +?
|
||||
META_PRUNE (*PRUNE) - no argument
|
||||
META_PRUNE (*PRUNE) - no argument (see below for with argument)
|
||||
META_QUERY ?
|
||||
META_QUERY_PLUS ?+
|
||||
META_QUERY_QUERY ??
|
||||
META_RANGE_ESCAPED hyphen in class range with at least one escape
|
||||
META_RANGE_LITERAL hyphen in class range defined literally
|
||||
META_SKIP (*SKIP) - no argument
|
||||
META_THEN (*THEN) - no argument
|
||||
META_SKIP (*SKIP) - no argument (see below for with argument)
|
||||
META_THEN (*THEN) - no argument (see below for with argument)
|
||||
|
||||
The two RANGE values occur only in character classes. They are positioned
|
||||
between two literals that define the start and end of the range. In an EBCDIC
|
||||
evironment it is necessary to know whether either of the range values was
|
||||
environment it is necessary to know whether either of the range values was
|
||||
specified as an escape. In an ASCII/Unicode environment the distinction is not
|
||||
relevant.
|
||||
|
||||
|
@ -229,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
|
|||
is the length of its branch, for which OP_REVERSE must be generated.
|
||||
|
||||
META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
|
||||
their data in the lower 16 bits of the element.
|
||||
their data in the lower 16 bits of the element. META_RECURSE is followed by an
|
||||
offset, for use in error messages.
|
||||
|
||||
META_BACKREF is followed by an offset if the back reference group number is 10
|
||||
or more. The offsets of the first ocurrences of references to groups whose
|
||||
or more. The offsets of the first occurrences of references to groups whose
|
||||
numbers are less than 10 are put in cb->small_ref_offset[] (only the first
|
||||
occurrence is useful). On 64-bit systems this avoids using more than two parsed
|
||||
pattern elements for items such as \3. The offset is used when an error occurs
|
||||
because the reference is to a non-existent group.
|
||||
|
||||
META_RECURSE is always followed by an offset, for use in error messages.
|
||||
|
||||
META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
|
||||
element contains the 16-bit type and data property values, packed together.
|
||||
ESC_g and ESC_k are used only for named references - numerical ones are turned
|
||||
|
@ -291,9 +290,9 @@ META_LOOKBEHIND (?<= start of lookbehind
|
|||
META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind
|
||||
META_LOOKBEHINDNOT (?<! start of negative lookbehind
|
||||
|
||||
The following are followed by two elements, the minimum and maximum. Repeat
|
||||
values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
|
||||
represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
The following are followed by two elements, the minimum and maximum. The
|
||||
maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
|
||||
is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
|
||||
META_MINMAX {n,m} repeat
|
||||
META_MINMAX_PLUS {n,m}+ repeat
|
||||
|
@ -347,11 +346,11 @@ support is not available for this kind of matching.
|
|||
Changeable options
|
||||
------------------
|
||||
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
|
||||
others) may be changed in the middle of patterns by items such as (?i). Their
|
||||
processing is handled entirely at compile time by generating different opcodes
|
||||
for the different settings. The runtime functions do not need to keep track of
|
||||
an option's state.
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
|
||||
some others may be changed in the middle of patterns by items such as (?i).
|
||||
Their processing is handled entirely at compile time by generating different
|
||||
opcodes for the different settings. The runtime functions do not need to keep
|
||||
track of an option's state.
|
||||
|
||||
PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
|
||||
are tracked and processed during the parsing pre-pass. The others are handled
|
||||
|
@ -437,7 +436,7 @@ Backtracking control verbs
|
|||
--------------------------
|
||||
|
||||
Verbs with no arguments generate opcodes with no following data (as listed
|
||||
in the section above).
|
||||
in the section above).
|
||||
|
||||
(*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
|
||||
length in one code unit, and followed by a binary zero. The name length is
|
||||
|
@ -468,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
|
|||
case-equivalent code points (which is possible only in UTF mode) is handled by
|
||||
compiling a Unicode property item (see below), with the pseudo-property
|
||||
PT_CLIST. The value of this property is an offset in a vector called
|
||||
"ucd_caseless_sets" which identifies the start of a short list of equivalent
|
||||
characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
"ucd_caseless_sets" which identifies the start of a short list of case
|
||||
equivalent characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
|
||||
|
||||
Repeating single characters
|
||||
|
@ -546,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
|
|||
and a value. The types are a set of #defines of the form PT_xxx, and the values
|
||||
are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
|
||||
The value is relevant only for PT_GC (General Category), PT_PC (Particular
|
||||
Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
|
||||
identify a list of case-equivalent characters when there are three or more.
|
||||
Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
|
||||
and the pseudo-property PT_CLIST, which is used to identify a list of
|
||||
case-equivalent characters when there are three or more (see above).
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
|
||||
|
@ -665,9 +665,9 @@ a count that immediately follows the offset.
|
|||
There are several opcodes that mark the end of a subpattern group. OP_KET is
|
||||
used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
|
||||
OP_KETRMAX are used for indefinite repetitions, minimally or maximally
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
details). All four are followed by a LINK_SIZE value giving (as a positive
|
||||
number) the offset back to the matching bracket opcode.
|
||||
number) the offset back to the matching opening bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
|
@ -718,7 +718,7 @@ Assertions
|
|||
|
||||
Forward assertions are also just like other subpatterns, but starting with one
|
||||
of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
|
||||
assertion is OP_REVERSE, followed by a count of the number of characters to
|
||||
move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
|
||||
|
@ -827,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
|
|||
opcode are the correct length, in order to catch updating errors.
|
||||
|
||||
Philip Hazel
|
||||
12 July 2019
|
||||
April 2022
|
||||
|
|
6
LICENCE
6
LICENCE
|
@ -26,7 +26,7 @@ Email domain: gmail.com
|
|||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2021 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2021 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2021 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
module(
|
||||
name = "pcre2",
|
||||
version = "10.40",
|
||||
compatibility_level = 1,
|
||||
)
|
||||
|
||||
bazel_dep(name = "rules_cc", version = "0.0.1")
|
||||
bazel_dep(name = "bazel_skylib", version = "1.2.1")
|
11
Makefile.am
11
Makefile.am
|
@ -382,6 +382,10 @@ COMMON_SOURCES = \
|
|||
src/pcre2_valid_utf.c \
|
||||
src/pcre2_xclass.c
|
||||
|
||||
# The pcre2_ucptables.c file is #included by pcre2_tables.c
|
||||
|
||||
EXTRA_DIST += src/pcre2_ucptables.c
|
||||
|
||||
if WITH_PCRE2_8
|
||||
lib_LTLIBRARIES += libpcre2-8.la
|
||||
libpcre2_8_la_SOURCES = \
|
||||
|
@ -448,9 +452,10 @@ EXTRA_DIST += \
|
|||
src/sljit/sljitNativePPC_32.c \
|
||||
src/sljit/sljitNativePPC_64.c \
|
||||
src/sljit/sljitNativePPC_common.c \
|
||||
src/sljit/sljitNativeRISCV_32.c \
|
||||
src/sljit/sljitNativeRISCV_64.c \
|
||||
src/sljit/sljitNativeRISCV_common.c \
|
||||
src/sljit/sljitNativeS390X.c \
|
||||
src/sljit/sljitNativeSPARC_32.c \
|
||||
src/sljit/sljitNativeSPARC_common.c \
|
||||
src/sljit/sljitNativeX86_32.c \
|
||||
src/sljit/sljitNativeX86_64.c \
|
||||
src/sljit/sljitNativeX86_common.c \
|
||||
|
@ -663,6 +668,7 @@ EXTRA_DIST += \
|
|||
testdata/testinput23 \
|
||||
testdata/testinput24 \
|
||||
testdata/testinput25 \
|
||||
testdata/testinput26 \
|
||||
testdata/testinputEBC \
|
||||
testdata/testoutput1 \
|
||||
testdata/testoutput2 \
|
||||
|
@ -705,6 +711,7 @@ EXTRA_DIST += \
|
|||
testdata/testoutput23 \
|
||||
testdata/testoutput24 \
|
||||
testdata/testoutput25 \
|
||||
testdata/testoutput26 \
|
||||
testdata/testoutputEBC \
|
||||
testdata/valgrind-jit.supp \
|
||||
testdata/wintestinput3 \
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
#
|
||||
# Project: pcre2
|
||||
#
|
||||
# Created on: 10-01-2022 22:01:46
|
||||
#
|
||||
# commands to use:
|
||||
# make -f Makefile.os4 libpcre2.a
|
||||
# make -f Makefile.os4 libpcre2-posix.a
|
||||
# make -f Makefile.os4 pcre2test
|
||||
# sh RunTest
|
||||
# make -f Makefile.os4 clean
|
||||
#
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Objects
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2_OBJ := \
|
||||
src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
|
||||
src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
|
||||
src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
|
||||
src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
|
||||
src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
|
||||
src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
|
||||
src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
|
||||
src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
|
||||
src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
|
||||
|
||||
|
||||
|
||||
pcre2posix_OBJ := \
|
||||
src/pcre2posix.o
|
||||
|
||||
|
||||
pcre2test_OBJ := \
|
||||
src/pcre2test.o
|
||||
|
||||
|
||||
pcre2grep_OBJ := \
|
||||
src/pcre2grep.o
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Variables and Environment
|
||||
##
|
||||
###################################################################
|
||||
|
||||
MCRT := -mcrt=newlib
|
||||
ifeq ($(USE_CLIB2), yes)
|
||||
MCRT := -mcrt=clib2
|
||||
endif
|
||||
|
||||
CC := gcc:bin/gcc
|
||||
|
||||
INCPATH := -I. -Isrc
|
||||
|
||||
# for pcre2test
|
||||
CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// General rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
.PHONY: all all-before all-after clean clean-custom realclean
|
||||
|
||||
all: all-before libpcre2.a libpcre2-posix.a all-after
|
||||
|
||||
all-before:
|
||||
# You can add rules here to execute before the project is built
|
||||
|
||||
all-after:
|
||||
# You can add rules here to execute after the project is built
|
||||
|
||||
tests: pcre2test pcre2grep
|
||||
|
||||
clean: clean-custom
|
||||
@echo "Cleaning compiler objects..."
|
||||
@rm -f $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ)
|
||||
|
||||
cleanall: clean
|
||||
@echo "Cleaning compiler targets..."
|
||||
@rm -f libpcre2.a libpcre2-posix.a pcre2test pcre2grep
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Targets
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2.a: $(libpcre2_OBJ)
|
||||
ar -rcs libpcre2.a $(libpcre2_OBJ)
|
||||
ranlib libpcre2.a
|
||||
|
||||
libpcre2-posix.a: $(pcre2posix_OBJ)
|
||||
ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
|
||||
ranlib libpcre2-posix.a
|
||||
|
||||
pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
|
||||
@echo "Linking pcre2test"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2 -lpcre2-posix
|
||||
@echo "Removing stale debug target: pcre2test"
|
||||
@rm -f pcre2test.debug
|
||||
|
||||
pcre2grep: libpcre2.a $(pcre2grep_OBJ)
|
||||
@echo "Linking pcre2grep"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
|
||||
@echo "Removing stale debug target: pcre2grep"
|
||||
@rm -f pcre2grep.debug
|
||||
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Standard rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# A default rule to make all the objects listed below
|
||||
# because we are hiding compiler commands from the output
|
||||
|
||||
.c.o:
|
||||
@echo "Compiling $<"
|
||||
@$(CC) -c $< -o $*.o $(CFLAGS)
|
||||
|
||||
src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
|
||||
src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
|
||||
src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
|
||||
|
||||
src/pcre2_maketables.o: src/pcre2_maketables.c
|
||||
|
||||
src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
|
||||
src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
|
||||
src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
|
||||
src/pcre2_ucd.c src/pcre2_printint.c
|
||||
|
||||
src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
|
||||
|
||||
|
||||
src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
|
||||
src/pcre2grep.o: src/pcre2grep.c src/config.h
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Custom rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
runtests: libpcre2.a libpcre2-posix.a tests
|
||||
sh RunTest
|
||||
sh RunGrepTest
|
||||
|
||||
release:
|
||||
@echo "Create release folders..."
|
||||
@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
|
||||
|
||||
@echo "Building newlib based libraries..."
|
||||
@make -f Makefile.os4 all
|
||||
@cp libpcre2.a release/local/newlib/lib/
|
||||
@cp libpcre2-posix.a release/local/newlib/lib/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@make -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Building clib2 based libraries..."
|
||||
@make -f Makefile.os4 all USE_CLIB2=yes
|
||||
@cp libpcre2.a release/local/clib2/lib/
|
||||
@cp libpcre2-posix.a release/local/clib2/lib/
|
||||
|
||||
@echo "Copy the necessary files..."
|
||||
@cp src/pcre2.h release/local/common/include/
|
||||
@cp src/pcre2posix.h release/local/common/include/
|
||||
@cp COPYING release/local/Documentation/pcre2/
|
||||
@cp HACKING release/local/Documentation/pcre2/
|
||||
@cp LICENCE release/local/Documentation/pcre2/
|
||||
@cp README release/local/Documentation/pcre2/
|
||||
@cp README-OS4.md release/local/Documentation/pcre2/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@make -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Creating the lha release file..."
|
||||
@rm -f pcre2.lha
|
||||
@lha -aeqr3 a pcre2.lha release/
|
||||
|
||||
@rm -rf release
|
||||
|
||||
###################################################################
|
||||
|
48
NEWS
48
NEWS
|
@ -2,8 +2,52 @@ News about PCRE2 releases
|
|||
-------------------------
|
||||
|
||||
|
||||
Version 10.38-RC1 31-August-2021
|
||||
--------------------------------
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
This is mostly a bug-fixing and code-tidying release. However, there are some
|
||||
extensions to Unicode property handling:
|
||||
|
||||
* Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
* A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
As always, see ChangeLog for a list of all changes (also the Git log).
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
This release is happening soon after 10.38 because the bug fix is important.
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Update to Unicode 14.0.0.
|
||||
|
||||
3. Some code cleanups (see ChangeLog).
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
As well as some bug fixes and tidies (as always, see ChangeLog for details),
|
||||
the documentation is updated to list the new URLs, following the move of the
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -343,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
66
README
66
README
|
@ -5,11 +5,10 @@ PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
|||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and should not be used in new projects. The latest release of
|
||||
PCRE2 is available in .tar.gz, tar.bz2, or .zip form from this GitHub
|
||||
repository:
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://github.com/PhilipHazel/pcre2/releases
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
|
@ -18,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
|||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/pcre2-dev
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -115,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -189,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -370,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -395,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -412,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -572,9 +578,9 @@ at build time" for more details.
|
|||
Making new tarballs
|
||||
-------------------
|
||||
|
||||
The command "make dist" creates two PCRE2 tarballs, in tar.gz and zip formats.
|
||||
The command "make distcheck" does the same, but then does a trial build of the
|
||||
new distribution to ensure that it works.
|
||||
The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
|
||||
zip formats. The command "make distcheck" does the same, but then does a trial
|
||||
build of the new distribution to ensure that it works.
|
||||
|
||||
If you have modified any of the man page sources in the doc directory, you
|
||||
should first run the PrepareRelease script before making a distribution. This
|
||||
|
@ -603,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -690,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -906,4 +912,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 27 August 2021
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
PCRE2 (Perl-compatible regular expression library)
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
|
||||
GitHub repository https://github.com/PCRE2Project/pcre2
|
||||
|
||||
More information about PCRE can be found at its official website
|
||||
at https://www.pcre.org and in the documentation that comes with this
|
||||
package.
|
||||
|
||||
In the archive both newlib and clib2 libraries are included. It has been
|
||||
tested with various applications, but in case you find issues please
|
||||
contact me.
|
||||
|
||||
To install it into your AmigaOS 4 SDK installation, just extract all the
|
||||
files into the SDK: path.
|
||||
|
||||
Compile
|
||||
--------------------------
|
||||
The source and the changes I made can be found at my personal repository
|
||||
https://git.walkero.gr/walkero/pcre2
|
||||
|
||||
You can compile it using the Makefile.os4 file, and produce the libraries
|
||||
yourself.
|
||||
|
||||
* with newlib run:
|
||||
```bash
|
||||
make -f Makefile.os4 all
|
||||
```
|
||||
* with clib2 run:
|
||||
```bash
|
||||
make -f Makefile.os4 all USE_CLIB2=yes
|
||||
```
|
||||
|
||||
Changelog
|
||||
--------------------------
|
||||
v10.40r1 - 2022-07-31
|
||||
* First release
|
||||
|
|
@ -14,14 +14,14 @@ flexible API, the code of PCRE2 has been much improved since the fork.
|
|||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PhilipHazel/pcre2.git
|
||||
svn co https://github.com/PhilipHazel/pcre2.git
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
|
@ -36,7 +36,7 @@ default character encoding, can be found at
|
|||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
|
48
RunGrepTest
48
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -674,13 +690,27 @@ echo "---------------------------- Test 131 -----------------------------" >>tes
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <$srcdir/testdata/grepinput >>testtrygrep 2>&1
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -755,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# This next test involves NUL characters. It seems impossible to handle them
|
||||
# easily in many operating systems. An earlier version of this script used sed
|
||||
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character (@). However, on (some
|
||||
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
|
|
63
RunTest
63
RunTest
|
@ -17,8 +17,16 @@
|
|||
# individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
|
||||
# end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
|
||||
# runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
|
||||
# except test 10. Whatever order the arguments are in, the tests are always run
|
||||
# in numerical order.
|
||||
# except test 10. Whatever order the arguments are in, these tests are always
|
||||
# run in numerical order.
|
||||
#
|
||||
# If no specific tests are selected (which is the case when this script is run
|
||||
# via 'make check') the default is to run all the numbered tests.
|
||||
#
|
||||
# There may also be named (as well as numbered) tests for special purposes. At
|
||||
# present there is just one, called "heap". This test's output contains the
|
||||
# sizes of heap frames and frame vectors, which depend on the environment. It
|
||||
# is therefore not run unless explicitly requested.
|
||||
#
|
||||
# Inappropriate tests are automatically skipped (with a comment to say so). For
|
||||
# example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
|
||||
|
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
|||
title23="Test 23: \C disabled test"
|
||||
title24="Test 24: Non-UTF pattern conversion tests"
|
||||
title25="Test 25: UTF pattern conversion tests"
|
||||
maxtest=25
|
||||
title26="Test 26: Auto-generated unicode property tests"
|
||||
maxtest=26
|
||||
titleheap="Test 'heap': Environment-specific heap tests"
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title23
|
||||
echo $title24
|
||||
echo $title25
|
||||
echo $title26
|
||||
echo ""
|
||||
echo $titleheap
|
||||
echo ""
|
||||
echo "Numbered tests are automatically run if nothing selected."
|
||||
echo "Named tests must be explicitly selected."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -238,6 +254,8 @@ do22=no
|
|||
do23=no
|
||||
do24=no
|
||||
do25=no
|
||||
do26=no
|
||||
doheap=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
|
|||
23) do23=yes;;
|
||||
24) do24=yes;;
|
||||
25) do25=yes;;
|
||||
26) do26=yes;;
|
||||
heap) doheap=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -320,7 +340,8 @@ fi
|
|||
# set up a large stack.
|
||||
|
||||
$sim ./pcre2test -S 64 /dev/null /dev/null
|
||||
if [ $? -eq 0 -a "$bigstack" != "" ] ; then
|
||||
support_setstack=$?
|
||||
if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
|
||||
setstack="-S 64"
|
||||
else
|
||||
setstack=""
|
||||
|
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# If no specific tests were requested, select all. Those that are not
|
||||
# relevant will be automatically skipped.
|
||||
# If no specific tests were requested, select all the numbered tests. Those
|
||||
# that are not relevant will be automatically skipped.
|
||||
|
||||
if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
|
||||
|
@ -416,7 +437,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
||||
$do24 = no -a $do25 = no \
|
||||
$do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -444,6 +465,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do23=yes
|
||||
do24=yes
|
||||
do25=yes
|
||||
do26=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
echo '' >testtry
|
||||
checkspecial '-C'
|
||||
checkspecial '--help'
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
if [ $support_setstack -eq 0 ] ; then
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
fi
|
||||
echo " OK"
|
||||
fi
|
||||
|
||||
|
@ -860,6 +884,29 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
fi
|
||||
|
||||
# Auto-generated unicode property tests
|
||||
|
||||
if [ $do26 = yes ] ; then
|
||||
echo $title26
|
||||
if [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
|
||||
checkresult $? 26 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Manually selected heap tests - output may vary in different environments,
|
||||
# which is why they are not automatically run.
|
||||
|
||||
if [ $doheap = yes ] ; then
|
||||
echo $titleheap
|
||||
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
|
||||
checkresult $? heap-$bits ""
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
||||
|
|
|
@ -135,9 +135,9 @@ if "%all%" == "yes" (
|
|||
set do7=yes
|
||||
set do8=yes
|
||||
set do9=yes
|
||||
set do10=yes
|
||||
set do10=no
|
||||
set do11=yes
|
||||
set do12=yes
|
||||
set do12=no
|
||||
set do13=yes
|
||||
set do14=yes
|
||||
set do15=yes
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# See MODULE.bazel
|
|
@ -1,17 +1,16 @@
|
|||
# Modified from FindReadline.cmake (PH Feb 2012)
|
||||
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
set(EDITLINE_FOUND TRUE)
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
|
||||
/usr/include/editline
|
||||
/usr/include/edit/readline
|
||||
/usr/include/readline
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
|
||||
editline
|
||||
edit/readline
|
||||
)
|
||||
|
||||
FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
|
||||
MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
|
||||
#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
|
||||
#cmakedefine HAVE_DIRENT_H 1
|
||||
#cmakedefine HAVE_INTTYPES_H 1
|
||||
#cmakedefine HAVE_STDINT_H 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
#cmakedefine HAVE_SYS_STAT_H 1
|
||||
#cmakedefine HAVE_SYS_TYPES_H 1
|
||||
|
|
44
configure.ac
44
configure.ac
|
@ -9,15 +9,15 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
|
|||
dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||
|
||||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [38])
|
||||
m4_define(pcre2_prerelease, [-RC1])
|
||||
m4_define(pcre2_date, [2021-08-31])
|
||||
m4_define(pcre2_minor, [41])
|
||||
m4_define(pcre2_prerelease, [])
|
||||
m4_define(pcre2_date, [2022-xx-xx])
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [10:3:10])
|
||||
m4_define(libpcre2_16_version, [10:3:10])
|
||||
m4_define(libpcre2_32_version, [10:3:10])
|
||||
m4_define(libpcre2_posix_version, [3:1:0])
|
||||
m4_define(libpcre2_8_version, [11:0:11])
|
||||
m4_define(libpcre2_16_version, [11:0:11])
|
||||
m4_define(libpcre2_32_version, [11:0:11])
|
||||
m4_define(libpcre2_posix_version, [3:2:0])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
@ -512,7 +512,20 @@ AC_TYPE_SIZE_T
|
|||
|
||||
# Checks for library functions.
|
||||
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp realpath secure_getenv strerror)
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
|
||||
AC_MSG_CHECKING([for realpath])
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
]],[[
|
||||
char buffer[PATH_MAX];
|
||||
realpath(".", buffer);
|
||||
]])],
|
||||
[AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_REALPATH], 1,
|
||||
[Define to 1 if you have the `realpath' function.])
|
||||
],
|
||||
AC_MSG_RESULT([no]))
|
||||
|
||||
# Check for the availability of libz (aka zlib)
|
||||
|
||||
|
@ -584,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
|
|||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Check for the availability of libedit. Different distributions put its
|
||||
# headers in different places. Try to cover the most common ones.
|
||||
|
||||
if test "$enable_pcre2test_libedit" = "yes"; then
|
||||
AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
|
||||
AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
|
||||
HAVE_LIBEDIT_HEADER=1
|
||||
break
|
||||
])
|
||||
AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
|
||||
fi
|
||||
|
||||
|
@ -927,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
|
|||
echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
|
||||
"$HAVE_READLINE_READLINE_H" != "1"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
|
||||
echo "** nor readline/readline.h was found."
|
||||
if test -z "$HAVE_LIBEDIT_HEADER"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
|
||||
echo "** edit/readline/readline.h nor a compatible header was found."
|
||||
exit 1
|
||||
fi
|
||||
if test -z "$LIBEDIT"; then
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -306,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -343,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
|
@ -5,11 +5,10 @@ PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
|||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and should not be used in new projects. The latest release of
|
||||
PCRE2 is available in .tar.gz, tar.bz2, or .zip form from this GitHub
|
||||
repository:
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://github.com/PhilipHazel/pcre2/releases
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
|
@ -18,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
|||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/pcre2-dev
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -115,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -189,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -370,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -395,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -412,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -572,9 +578,9 @@ at build time" for more details.
|
|||
Making new tarballs
|
||||
-------------------
|
||||
|
||||
The command "make dist" creates two PCRE2 tarballs, in tar.gz and zip formats.
|
||||
The command "make distcheck" does the same, but then does a trial build of the
|
||||
new distribution to ensure that it works.
|
||||
The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
|
||||
zip formats. The command "make distcheck" does the same, but then does a trial
|
||||
build of the new distribution to ensure that it works.
|
||||
|
||||
If you have modified any of the man page sources in the doc directory, you
|
||||
should first run the PrepareRelease script before making a distribution. This
|
||||
|
@ -603,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -690,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -906,4 +912,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 27 August 2021
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -92,8 +92,18 @@ Additional options may be set in the compile context via the
|
|||
function.
|
||||
</P>
|
||||
<P>
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the <i>errorcode</i> argument to the
|
||||
<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
|
||||
error was encountered is returned via the <i>erroroffset</i> argument.
|
||||
</P>
|
||||
<P>
|
||||
If there is no error, the value passed via <i>errorcode</i> returns the message
|
||||
"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
|
||||
via <i>erroroffset</i> is zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
|
|
|
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
<b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
|
||||
which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
page.
|
||||
</P>
|
||||
|
|
|
@ -48,7 +48,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA <i>number_of_codes</i> is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in <i>bytes</i>
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL <i>codes</i> or <i>bytes</i> is NULL
|
||||
</pre>
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -30,8 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{df800} to \x{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
|
||||
|
|
|
@ -68,29 +68,29 @@ automatically added.
|
|||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement
|
||||
(only relevant if PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
</pre>
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-zero; its
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
|
||||
contents must be the result of a call to <b>pcre2_match()</b> using the same
|
||||
pattern and subject.
|
||||
</P>
|
||||
|
|
|
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
</P>
|
||||
<P>
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
|
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|||
limit is set, less than the default.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
<b>pcre2_match()</b> uses the heap are given in the
|
||||
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
|
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<br>
|
||||
<br>
|
||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1383,8 +1381,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and <b>pcre2_compile()</b> returns a non-NULL value.
|
||||
error has occurred.
|
||||
</P>
|
||||
<P>
|
||||
There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
|
||||
|
@ -1399,15 +1396,18 @@ because the textual error messages that are obtained by calling the
|
|||
message"
|
||||
<a href="#geterrormessage">below)</a>
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in <b>pcre2.h</b>.
|
||||
for both positive and negative error codes in <b>pcre2.h</b>. When compilation
|
||||
is successful <i>errorcode</i> is set to a value that returns the message "no
|
||||
error" if passed to <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The value returned in <i>erroroffset</i> is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
</P>
|
||||
<P>
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
|
@ -1845,7 +1845,7 @@ undefined. It may cause your program to crash or loop.
|
|||
</P>
|
||||
<P>
|
||||
Note that this option can also be passed to <b>pcre2_match()</b> and
|
||||
<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -2055,8 +2055,8 @@ point. However, this applies only to characters whose code points are less than
|
|||
\d.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode properties
|
||||
of all characters can be tested with \p and \P, or, alternatively, the
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \p and \P, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
|
@ -2316,7 +2316,7 @@ return zero. The third argument should point to a <b>size_t</b> variable.
|
|||
PCRE2_INFO_LASTCODETYPE
|
||||
</pre>
|
||||
Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
|
@ -2640,7 +2640,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
|
|||
<i>startoffset</i>. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
|
||||
<i>length</i> is zero, the subject is assumed to be an empty string. If
|
||||
<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
If <i>startoffset</i> is greater than the length of the subject,
|
||||
|
@ -3144,11 +3146,11 @@ The backtracking match limit was reached.
|
|||
<pre>
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
</pre>
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL
|
||||
</pre>
|
||||
|
@ -3394,12 +3396,17 @@ same number causes an error at compile time.
|
|||
<P>
|
||||
This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
|
||||
subject string in <i>outputbuffer</i>, replacing parts that were matched with
|
||||
the <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
||||
option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
||||
replacement string(s). The default action is to perform just one replacement if
|
||||
the pattern matches, but there is an option that requests multiple replacements
|
||||
(see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
the <i>replacement</i> string, whose length is supplied in <b>rlength</b>, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
|
||||
replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
|
||||
error occurs if <i>replacement</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
</P>
|
||||
<P>
|
||||
If successful, <b>pcre2_substitute()</b> returns the number of substitutions
|
||||
|
@ -3433,12 +3440,12 @@ block may or may not have been changed.
|
|||
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
<i>match_data</i> block must be provided, and it must have been used for an
|
||||
external call to <b>pcre2_match()</b>. The data in the <i>match_data</i> block
|
||||
(return code, offset vector) is used for the first substitution instead of
|
||||
calling <b>pcre2_match()</b> from within <b>pcre2_substitute()</b>. This allows
|
||||
an application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
<i>match_data</i> block must be provided, and it must have already been used for
|
||||
an external call to <b>pcre2_match()</b> with the same pattern and subject
|
||||
arguments. The data in the <i>match_data</i> block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling <b>pcre2_match()</b>
|
||||
from within <b>pcre2_substitute()</b>. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
</P>
|
||||
<P>
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
|
@ -3583,7 +3590,7 @@ and force lower case. The escape sequences change the current state: \U and
|
|||
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
||||
\u and \l force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \Q...\E quoted sequences. If either
|
||||
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
||||
properties are used for case forcing characters whose code points are greater
|
||||
|
@ -3655,7 +3662,9 @@ default.
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
<i>match_data</i> argument is NULL.
|
||||
<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
|
@ -3810,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
<P>
|
||||
The function <b>pcre2_dfa_match()</b> is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
<b>pcre2_dfa_match()</b> does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
|
||||
not support, see the
|
||||
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -3850,7 +3860,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
|
|||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
Option bits for <b>pcre_dfa_match()</b>
|
||||
Option bits for <b>pcre2_dfa_match()</b>
|
||||
</b><br>
|
||||
<P>
|
||||
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
||||
|
@ -4008,9 +4018,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \P, \p,
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
|
||||
supported. Details are given in the
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
<pre>
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
</pre>
|
||||
to the <b>configure</b> command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -553,15 +553,16 @@ documentation.
|
|||
<P>
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
<pre>
|
||||
--disable-percent-zt
|
||||
</pre>
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
a suitable format is used depending on the size of long for the platform.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
|
||||
<P>
|
||||
|
@ -607,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 March 2020
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -18,33 +18,41 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
<P>
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
</P>
|
||||
<P>
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
</P>
|
||||
<P>
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \b* (but not \b{3}, though oddly it does allow ^{3}), but these
|
||||
do not seem to have any use. PCRE2 does not allow any kind of quantifier on
|
||||
non-lookaround assertions.
|
||||
for example, \b* , but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
</P>
|
||||
<P>
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
</P>
|
||||
<P>
|
||||
4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
\U, and \N when followed by a character name. \N on its own, matching a
|
||||
non-newline character, and \N{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -55,26 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
|
|||
interprets them.
|
||||
</P>
|
||||
<P>
|
||||
5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \p and \P are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
|
||||
is limited. See the
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
</P>
|
||||
<P>
|
||||
6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \Q and \E which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \Q and \E just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \Q
|
||||
and \E which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \Q and \E just like any other character. Note the
|
||||
following examples:
|
||||
<pre>
|
||||
Pattern PCRE2 matches Perl matches
|
||||
|
||||
|
@ -88,19 +96,19 @@ The \Q...\E sequence is recognized both inside and outside character classes
|
|||
by both PCRE2 and Perl.
|
||||
</P>
|
||||
<P>
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
</P>
|
||||
<P>
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
|
@ -109,20 +117,20 @@ the group does not contain any | characters. Note that such groups are
|
|||
processed as anchored at the point where they are tested.
|
||||
</P>
|
||||
<P>
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
</P>
|
||||
<P>
|
||||
11. There are some differences that are concerned with the settings of captured
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
"b".
|
||||
</P>
|
||||
<P>
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact that PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
|
@ -132,42 +140,43 @@ to distinguish which group matched, because both names map to capture group
|
|||
number 1. To avoid this confusing situation, an error is given at compile time.
|
||||
</P>
|
||||
<P>
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a group. If the /x modifier is
|
||||
set, Perl allowed white space between ( and ? though the latest Perls give an
|
||||
error (for a while it was just deprecated). There may still be some cases where
|
||||
Perl behaves differently.
|
||||
</P>
|
||||
<P>
|
||||
14. Perl, when in warning mode, gives warnings for character classes such as
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
[A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||
warning features, so it gives an error in these cases because they are almost
|
||||
certainly user mistakes.
|
||||
</P>
|
||||
<P>
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \p{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.32), \p{Lu} and \p{Ll} match all
|
||||
in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
</P>
|
||||
<P>
|
||||
16. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
17. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\K is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
</P>
|
||||
<P>
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.32:
|
||||
list is with respect to Perl 5.34:
|
||||
<br>
|
||||
<br>
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same length.
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
<br>
|
||||
<br>
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
|
@ -221,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible
|
|||
lookarounds are atomic.
|
||||
</P>
|
||||
<P>
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
</P>
|
||||
<P>
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
<a href="pcre2limit.html"><b>pcre2limit</b></a>
|
||||
documentation for details. With release 5.10, Perl moved from recursion to iteration,
|
||||
keeping the intermediate matches on the heap, which is ~10% slower but does not
|
||||
|
@ -248,7 +257,7 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 08 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -141,8 +141,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">CONVERTING POSIX PATTERNS</a><br>
|
||||
<P>
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
|
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
|
|||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
|
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -1053,9 +1066,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 31 August 2021
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -269,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
|
|||
for currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
</P>
|
||||
<P>
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
|
@ -382,8 +382,8 @@ out this complicated API.
|
|||
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -442,10 +442,10 @@ that was not compiled.
|
|||
<P>
|
||||
When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
</P>
|
||||
<P>
|
||||
Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
|
||||
|
@ -466,9 +466,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 30 November 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -534,7 +534,7 @@ for themselves. For example, outside a character class:
|
|||
\0113 is a tab followed by the character "3"
|
||||
\113 might be a backreference, otherwise the character with octal code 113
|
||||
\377 might be a backreference, otherwise the value 255 (decimal)
|
||||
\81 is always a backreference .sp
|
||||
\81 is always a backreference
|
||||
</pre>
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
must not be introduced by a leading zero, because no more than three octal
|
||||
|
@ -776,194 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
</P>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
</P>
|
||||
<P>
|
||||
The extra escape sequences that provide property support are:
|
||||
<pre>
|
||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The property names represented by <i>xx</i> above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
<a href="#extraprops">next section).</a>
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \P{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
The property names represented by <i>xx</i> above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
<a href="#extraprops">below).</a>
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \P{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
</P>
|
||||
<br><b>
|
||||
Script properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\p{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and an equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
</P>
|
||||
<P>
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
<pre>
|
||||
\p{Greek}
|
||||
\P{Han}
|
||||
</pre>
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The general category property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
|
@ -1025,9 +893,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
</pre>
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
</P>
|
||||
<P>
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
|
@ -1054,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
|
|||
example, \p{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
</P>
|
||||
<br><b>
|
||||
Binary (yes/no) properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The Bidi_Class property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
</pre>
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
</P>
|
||||
<br><b>
|
||||
Extended grapheme clusters
|
||||
|
@ -1336,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
<P>
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
<a href="#newlines">"Newline conventions"</a>
|
||||
above).
|
||||
</P>
|
||||
<P>
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
|
||||
of CR or LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
</P>
|
||||
<P>
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
|
@ -2175,10 +2087,10 @@ be easier to remember:
|
|||
<pre>
|
||||
(*atomic:\d+)foo
|
||||
</pre>
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
</P>
|
||||
<P>
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
|
@ -2899,7 +2811,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
||||
\b (?&byte) (\.(?&byte)){3} \b
|
||||
</pre>
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -3854,9 +3766,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||
<P>
|
||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
||||
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
<pre>
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
</pre>
|
||||
|
@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
|
|||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
<pre>
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
|
||||
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
|
||||
<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
|
||||
<li><a name="TOC13" href="#SEC13">CAPTURING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC15" href="#SEC15">COMMENT</a>
|
||||
<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
|
||||
<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC20" href="#SEC20">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
|
||||
<li><a name="TOC22" href="#SEC22">BACKREFERENCES</a>
|
||||
<li><a name="TOC23" href="#SEC23">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC24" href="#SEC24">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC25" href="#SEC25">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC26" href="#SEC26">CALLOUTS</a>
|
||||
<li><a name="TOC27" href="#SEC27">SEE ALSO</a>
|
||||
<li><a name="TOC28" href="#SEC28">AUTHOR</a>
|
||||
<li><a name="TOC29" href="#SEC29">REVISION</a>
|
||||
<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
|
||||
<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
|
||||
<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
|
||||
<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
|
||||
<li><a name="TOC15" href="#SEC15">CAPTURING</a>
|
||||
<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMMENT</a>
|
||||
<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
|
||||
<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
|
||||
<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
|
||||
<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
|
||||
<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
|
||||
<li><a name="TOC30" href="#SEC30">AUTHOR</a>
|
||||
<li><a name="TOC31" href="#SEC31">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
|
||||
<P>
|
||||
|
@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
|
|||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
</P>
|
||||
<P>
|
||||
Property descriptions in \p and \P are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
|
@ -152,6 +159,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
|
@ -198,166 +206,58 @@ characters.
|
|||
Perl and POSIX space are now the same. Perl added VT to its space character set
|
||||
at release 5.18.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||
<P>
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
[...] positive character class
|
||||
|
@ -385,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
|
|||
but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
||||
\Q...\E inside a character class.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
? 0 or 1, greedy
|
||||
|
@ -406,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
{n,}? n or more, lazy
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\b word boundary
|
||||
|
@ -424,7 +324,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
\G first matching position in subject
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\K set reported start of match
|
||||
|
@ -434,13 +334,13 @@ for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
|||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\K is honoured in positive assertions, but ignored in negative ones.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
expr|expr|expr...
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(...) capture group
|
||||
|
@ -455,20 +355,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
|
|||
in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
|
||||
both cases, a name must not start with a digit.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?>...) atomic non-capture group
|
||||
(*atomic:...) atomic non-capture group
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?#....) comment (not nestable)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
|
||||
<P>
|
||||
Changes of these options within a group are automatically cancelled at the end
|
||||
of the group.
|
||||
|
@ -513,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
|
|||
application can lock out the use of (*UTF) and (*UCP) by setting the
|
||||
PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
settings with a similar syntax.
|
||||
|
@ -526,7 +426,7 @@ settings with a similar syntax.
|
|||
(*NUL) the NUL character (binary zero)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
setting with a similar syntax.
|
||||
|
@ -535,7 +435,7 @@ setting with a similar syntax.
|
|||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?=...) )
|
||||
|
@ -556,7 +456,7 @@ setting with a similar syntax.
|
|||
</pre>
|
||||
Each top-level branch of a lookbehind must be of a fixed length.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<P>
|
||||
These assertions are specific to PCRE2 and are not Perl-compatible.
|
||||
<pre>
|
||||
|
@ -569,7 +469,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*non_atomic_positive_lookbehind:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(*script_run:...) ) script run, can be backtracked into
|
||||
|
@ -579,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*asr:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\n reference by number (can be ambiguous)
|
||||
|
@ -596,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(?P=name) reference by name (Python)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?R) recurse whole pattern
|
||||
|
@ -615,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
\g'-n' call subroutine by relative number (PCRE2 extension)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?(condition)yes-pattern)
|
||||
|
@ -638,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
|||
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||
condition if the relevant named group exists.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
|
||||
name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
|
||||
|
@ -665,7 +565,7 @@ pattern is not anchored.
|
|||
The effect of one of these verbs in a group called as a subroutine is confined
|
||||
to the subroutine call.
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">CALLOUTS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout (assumed number 0)
|
||||
|
@ -676,12 +576,12 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
|||
start and the end), and the starting delimiter { matched with the ending
|
||||
delimiter }. To encode the ending delimiter within the string, double it.
|
||||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC28" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -690,11 +590,11 @@ Retired from University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC29" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -78,7 +78,7 @@ to 8-bit code units for output.
|
|||
</P>
|
||||
<P>
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, <b>pcre_compile()</b>. The actual
|
||||
are given in generic form, for example, <b>pcre2_compile()</b>. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
<a name="inputencoding"></a></P>
|
||||
<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
|
||||
|
@ -253,7 +253,19 @@ available, and the use of JIT for matching is verified.
|
|||
<b>-LM</b>
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LP</b>
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LS</b>
|
||||
List scripts: write a list of recognized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-pattern</b> <i>modifier-list</i>
|
||||
|
@ -1229,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1239,6 +1252,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1550,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1560,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1589,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1609,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1668,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
|
@ -1676,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
<b>null_replacement</b> modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
|
@ -2122,9 +2143,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
|
|||
<P>
|
||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||
\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
and
|
||||
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -477,7 +478,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -486,9 +487,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 23 February 2020
|
||||
Last updated: 22 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
1358
doc/pcre2.txt
1358
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -80,8 +80,17 @@ Additional options may be set in the compile context via the
|
|||
.\"
|
||||
function.
|
||||
.P
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the \fIerrorcode\fP argument to the
|
||||
\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
|
||||
error was encountered is returned via the \fIerroroffset\fP argument.
|
||||
.P
|
||||
If there is no error, the value passed via \fIerrorcode\fP returns the message
|
||||
"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
|
||||
via \fIerroroffset\fP is zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
each option, in the
|
||||
|
|
|
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
\fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
|
||||
which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
.\" HREF
|
||||
\fBpcre2jit\fP
|
||||
.\"
|
||||
|
|
|
@ -36,7 +36,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL \fIcodes\fP or \fIbytes\fP is NULL
|
||||
.sp
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -18,9 +18,9 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{df800} to \ex{dfff}
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and
|
||||
|
|
|
@ -55,32 +55,42 @@ automatically added.
|
|||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NOTBOL Subject is not the beginning of a
|
||||
line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_NOTEMPTY An empty string is not a
|
||||
valid match
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of
|
||||
the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in
|
||||
the subject or replacement
|
||||
.\" JOIN
|
||||
(only relevant if PCRE2_UTF was
|
||||
set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the
|
||||
subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for
|
||||
first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
.sp
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
.P
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-zero; its
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
|
||||
contents must be the result of a call to \fBpcre2_match()\fP using the same
|
||||
pattern and subject.
|
||||
.P
|
||||
|
|
123
doc/pcre2api.3
123
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "30 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -953,7 +953,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
.P
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
pattern of the form
|
||||
|
@ -964,18 +964,18 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
|||
less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
|
||||
limit is set, less than the default.
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The \fBpcre2_match()\fP function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
\fBpcre2_match()\fP uses the heap are given in the
|
||||
.\" HREF
|
||||
\fBpcre2perform\fP
|
||||
.\"
|
||||
documentation.
|
||||
.P
|
||||
Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
|
||||
|
@ -1019,10 +1019,10 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
|
|||
.fi
|
||||
.sp
|
||||
This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1323,8 +1323,7 @@ If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and \fBpcre2_compile()\fP returns a non-NULL value.
|
||||
error has occurred.
|
||||
.P
|
||||
There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
|
||||
if it finds an error in the pattern. There are also some negative error codes
|
||||
|
@ -1343,14 +1342,17 @@ message"
|
|||
below)
|
||||
.\"
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in \fBpcre2.h\fP.
|
||||
for both positive and negative error codes in \fBpcre2.h\fP. When compilation
|
||||
is successful \fIerrorcode\fP is set to a value that returns the message "no
|
||||
error" if passed to \fBpcre2_get_error_message()\fP.
|
||||
.P
|
||||
The value returned in \fIerroroffset\fP is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
.P
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
cases, the offset passed back is the length of the pattern. Note that the
|
||||
|
@ -1794,7 +1796,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is
|
|||
undefined. It may cause your program to crash or loop.
|
||||
.P
|
||||
Note that this option can also be passed to \fBpcre2_match()\fP and
|
||||
\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
.P
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
|
@ -2015,8 +2017,8 @@ point. However, this applies only to characters whose code points are less than
|
|||
256. By default, higher-valued code points never match escapes such as \ew or
|
||||
\ed.
|
||||
.P
|
||||
When PCRE2 is built with Unicode support (the default), the Unicode properties
|
||||
of all characters can be tested with \ep and \eP, or, alternatively, the
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \ep and \eP, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
|
@ -2279,7 +2281,7 @@ return zero. The third argument should point to a \fBsize_t\fP variable.
|
|||
PCRE2_INFO_LASTCODETYPE
|
||||
.sp
|
||||
Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
\fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
|
@ -2624,7 +2626,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in
|
|||
\fIstartoffset\fP. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and
|
||||
\fIlength\fP is zero, the subject is assumed to be an empty string. If
|
||||
\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL.
|
||||
.P
|
||||
If \fIstartoffset\fP is greater than the length of the subject,
|
||||
\fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
|
||||
|
@ -3158,11 +3162,11 @@ The backtracking match limit was reached.
|
|||
.sp
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
.sp
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
.sp
|
||||
PCRE2_ERROR_NULL
|
||||
.sp
|
||||
|
@ -3413,12 +3417,16 @@ same number causes an error at compile time.
|
|||
.P
|
||||
This function optionally calls \fBpcre2_match()\fP and then makes a copy of the
|
||||
subject string in \fIoutputbuffer\fP, replacing parts that were matched with
|
||||
the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
||||
option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
||||
replacement string(s). The default action is to perform just one replacement if
|
||||
the pattern matches, but there is an option that requests multiple replacements
|
||||
(see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the
|
||||
replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an
|
||||
error occurs if \fIreplacement\fP is NULL.
|
||||
.P
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
.P
|
||||
If successful, \fBpcre2_substitute()\fP returns the number of substitutions
|
||||
that were carried out. This may be zero if no match was found, and is never
|
||||
|
@ -3447,12 +3455,12 @@ block may or may not have been changed.
|
|||
As well as the usual options for \fBpcre2_match()\fP, a number of additional
|
||||
options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
\fImatch_data\fP block must be provided, and it must have been used for an
|
||||
external call to \fBpcre2_match()\fP. The data in the \fImatch_data\fP block
|
||||
(return code, offset vector) is used for the first substitution instead of
|
||||
calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. This allows
|
||||
an application to check for a match before choosing to substitute, without
|
||||
having to repeat the match.
|
||||
\fImatch_data\fP block must be provided, and it must have already been used for
|
||||
an external call to \fBpcre2_match()\fP with the same pattern and subject
|
||||
arguments. The data in the \fImatch_data\fP block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling \fBpcre2_match()\fP
|
||||
from within \fBpcre2_substitute()\fP. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
.P
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
|
||||
|
@ -3584,7 +3592,7 @@ and force lower case. The escape sequences change the current state: \eU and
|
|||
terminating a \eQ quoted sequence) reverts to no case forcing. The sequences
|
||||
\eu and \el force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \eQ...\eE quoted sequences. If either
|
||||
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
||||
properties are used for case forcing characters whose code points are greater
|
||||
|
@ -3649,7 +3657,9 @@ needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
|||
default.
|
||||
.P
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
\fImatch_data\fP argument is NULL.
|
||||
\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0.
|
||||
.P
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||
|
@ -3811,12 +3821,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
.P
|
||||
The function \fBpcre2_dfa_match()\fP is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
\fBpcre2_dfa_match()\fP does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does
|
||||
not support, see the
|
||||
.\" HREF
|
||||
\fBpcre2matching\fP
|
||||
.\"
|
||||
|
@ -3848,7 +3859,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
|
|||
wspace, /* working space vector */
|
||||
20); /* number of elements (NOT size in bytes) */
|
||||
.
|
||||
.SS "Option bits for \fBpcre_dfa_match()\fP"
|
||||
.SS "Option bits for \fBpcre2_dfa_match()\fP"
|
||||
.rs
|
||||
.sp
|
||||
The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
|
||||
|
@ -4016,6 +4027,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "20 March 2020" "PCRE2 10.35"
|
||||
.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \eP, \ep,
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||
supported. Details are given in the
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
|
|||
\fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The \fBpcre2_match()\fP function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
|
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
.sp
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -563,15 +563,16 @@ documentation.
|
|||
.sp
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
.sp
|
||||
--disable-percent-zt
|
||||
.sp
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
a suitable format is used depending on the size of long for the platform.
|
||||
.
|
||||
.
|
||||
.SH "SUPPORT FOR FUZZERS"
|
||||
|
@ -623,7 +624,7 @@ give a warning.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -632,6 +633,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 March 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2COMPAT 3 "30 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
|
||||
|
@ -6,31 +6,38 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
.P
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
.P
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \eb* (but not \eb{3}, though oddly it does allow ^{3}), but these
|
||||
do not seem to have any use. PCRE2 does not allow any kind of quantifier on
|
||||
non-lookaround assertions.
|
||||
for example, \eb*, but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
.P
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
.P
|
||||
4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
\eU, and \eN when followed by a character name. \eN on its own, matching a
|
||||
non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -40,12 +47,12 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
|
|||
PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
|
||||
interprets them.
|
||||
.P
|
||||
5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \ep and \eP are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use
|
||||
is limited. See the
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -53,14 +60,14 @@ documentation for details. The long synonyms for property names that Perl
|
|||
supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
.P
|
||||
6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \eQ and \eE which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \eQ
|
||||
and \eE which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \eQ and \eE just like any other character. Note the
|
||||
following examples:
|
||||
.sp
|
||||
Pattern PCRE2 matches Perl matches
|
||||
.sp
|
||||
|
@ -75,7 +82,7 @@ other character. Note the following examples:
|
|||
The \eQ...\eE sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
.P
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
.\" HREF
|
||||
|
@ -83,11 +90,11 @@ external function to be called during pattern matching. See the
|
|||
.\"
|
||||
documentation for details.
|
||||
.P
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
.P
|
||||
9. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
|
@ -95,18 +102,18 @@ that is called as a subroutine, its action is limited to that group, even if
|
|||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
.P
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
.P
|
||||
11. There are some differences that are concerned with the settings of captured
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
"b".
|
||||
.P
|
||||
12. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact that PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
|
@ -115,37 +122,38 @@ causes an error at compile time. If it were allowed, it would not be possible
|
|||
to distinguish which group matched, because both names map to capture group
|
||||
number 1. To avoid this confusing situation, an error is given at compile time.
|
||||
.P
|
||||
13. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a group. If the /x modifier is
|
||||
set, Perl allowed white space between ( and ? though the latest Perls give an
|
||||
error (for a while it was just deprecated). There may still be some cases where
|
||||
Perl behaves differently.
|
||||
.P
|
||||
14. Perl, when in warning mode, gives warnings for character classes such as
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
[A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||
warning features, so it gives an error in these cases because they are almost
|
||||
certainly user mistakes.
|
||||
.P
|
||||
15. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \ep{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.32), \ep{Lu} and \ep{Ll} match all
|
||||
in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
.P
|
||||
16. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
17. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\eK is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
.P
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.32:
|
||||
list is with respect to Perl 5.34:
|
||||
.sp
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same length.
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
.sp
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
in lookbehinds, provided that there is no possibility of referencing a
|
||||
|
@ -186,11 +194,11 @@ the pattern.
|
|||
extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
.P
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts \ed numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
.P
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
.\" HREF
|
||||
\fBpcre2limit\fP
|
||||
.\"
|
||||
|
@ -214,6 +222,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -116,8 +116,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
(which does match separators) is supported.
|
||||
.P
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
.
|
||||
.
|
||||
.SH "CONVERTING POSIX PATTERNS"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "31 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
|
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
|
@ -516,10 +525,7 @@ counter that is incremented each time around its main processing loop. If the
|
|||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
|
@ -732,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -960,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 31 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -251,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
|
|||
starts another match, that match must use a different JIT stack to the one used
|
||||
for currently suspended match(es).
|
||||
.P
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
.P
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
to a match context that is used by any number of patterns, as long as they are
|
||||
|
@ -355,8 +355,8 @@ out this complicated API.
|
|||
.B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
|
||||
.fi
|
||||
.P
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -416,10 +416,10 @@ that was not compiled.
|
|||
.P
|
||||
When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
.P
|
||||
Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
|
||||
speedups of more than 10%.
|
||||
|
@ -445,6 +445,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 November 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SIZE AND OTHER LIMITATIONS"
|
||||
|
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
.P
|
||||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
.P
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -67,6 +71,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "3o0 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -509,7 +509,6 @@ for themselves. For example, outside a character class:
|
|||
.\" JOIN
|
||||
\e377 might be a backreference, otherwise
|
||||
the value 255 (decimal)
|
||||
.\" JOIN
|
||||
\e81 is always a backreference
|
||||
.sp
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
|
@ -773,195 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.P
|
||||
The extra escape sequences that provide property support are:
|
||||
.sp
|
||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The property names represented by \fIxx\fP above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
The property names represented by \fIxx\fP above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
.\" HTML <a href="#extraprops">
|
||||
.\" </a>
|
||||
next section).
|
||||
below).
|
||||
.\"
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \eP{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \eP{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "Script properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and an equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
.P
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
.sp
|
||||
\ep{Greek}
|
||||
\eP{Han}
|
||||
.sp
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
.P
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "The general category property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
specified by including a circumflex between the opening brace and the property
|
||||
|
@ -1021,9 +889,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
.sp
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
.P
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
the range U+D800 to U+DFFF. These characters are no different to any other
|
||||
|
@ -1047,12 +915,53 @@ Unicode table.
|
|||
Specifying caseless matching does not affect these escape sequences. For
|
||||
example, \ep{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.
|
||||
.
|
||||
.SS "Binary (yes/no) properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.SS "The Bidi_Class property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
.sp
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
.
|
||||
.
|
||||
.SS Extended grapheme clusters
|
||||
|
@ -1331,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
.sp
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
.\" HTML <a href="#newlines">
|
||||
.\" </a>
|
||||
"Newline conventions"
|
||||
.\"
|
||||
above).
|
||||
.P
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
|
||||
of CR or LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
.P
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
PCRE2_DOTALL option is set, a dot matches any one character, without exception.
|
||||
|
@ -2181,10 +2095,10 @@ be easier to remember:
|
|||
.sp
|
||||
(*atomic:\ed+)foo
|
||||
.sp
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
.P
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
string of characters that an identical standalone pattern would match, if
|
||||
|
@ -2930,7 +2844,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
|
||||
\eb (?&byte) (\e.(?&byte)){3} \eb
|
||||
.sp
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -3900,6 +3814,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 PERFORMANCE"
|
||||
|
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
.P
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
.P
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to \fBpcre2_match()\fP. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
.P
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to \fBpcre2_match()\fP with the same match data block does not
|
||||
affect the saved block.
|
||||
.P
|
||||
In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround assertions,
|
||||
|
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -239,6 +255,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
.nf
|
||||
.B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
|
||||
.B " pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
|
||||
.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
|
||||
.B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
|
||||
|
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
.sp
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
.sp
|
||||
|
@ -141,7 +141,6 @@ management functions for the decoded patterns. If this argument is NULL,
|
|||
\fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
.sp
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "30 August 2021" "PCRE2 10.38"
|
||||
.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range
|
|||
128-255. If the PCRE2_UCP option is set, the behaviour of these escape
|
||||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
.P
|
||||
Property descriptions in \ep and \eP are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
.
|
||||
.
|
||||
.SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP"
|
||||
|
@ -120,6 +124,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
.sp
|
||||
M Mark
|
||||
|
@ -167,165 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set
|
|||
at release 5.18.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT NAMES FOR \ep AND \eP"
|
||||
.SH "BINARY PROPERTIES FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Chorasmian,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dives_Akuru,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khitan_Small_Script,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yezidi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER CLASSES"
|
||||
|
@ -679,6 +578,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "30 August 2021" "PCRE 10.38"
|
||||
.TH PCRE2TEST 1 "27 July 2022" "PCRE 10.41"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -47,7 +47,7 @@ format before being passed to the library functions. Results are converted back
|
|||
to 8-bit code units for output.
|
||||
.P
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, \fBpcre_compile()\fP. The actual
|
||||
are given in generic form, for example, \fBpcre2_compile()\fP. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
.
|
||||
.
|
||||
|
@ -211,7 +211,17 @@ available, and the use of JIT for matching is verified.
|
|||
\fB-LM\fP
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LP\fP
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LS\fP
|
||||
List scripts: write a list of recognized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-pattern\fP \fImodifier-list\fP
|
||||
Behave as if each pattern line contains the given modifiers.
|
||||
|
@ -1196,7 +1206,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use \fBpcre2_dfa_match()\fP
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1206,6 +1217,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1516,7 +1529,7 @@ value that was set on the pattern.
|
|||
.sp
|
||||
The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
\fBfind_limits\fP modifier is specified.
|
||||
\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
|
||||
.
|
||||
.
|
||||
.SS "Finding minimum limits"
|
||||
|
@ -1526,8 +1539,12 @@ If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via \fBpcre2_set_heap_limit()\fP,
|
||||
\fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
.P
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
|
||||
|
@ -1551,9 +1568,7 @@ and non-recursive, to the internal matching function, thus controlling the
|
|||
overall amount of computing resource that is used.
|
||||
.P
|
||||
For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
.
|
||||
.
|
||||
.SS "Showing MARK names"
|
||||
|
@ -1572,12 +1587,10 @@ is added to the non-match message.
|
|||
.sp
|
||||
The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the \fBmemory\fP modifier never has any effect. For this modifier to work, the
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
\fBnull_context\fP modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
.
|
||||
|
@ -1629,7 +1642,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
.
|
||||
.
|
||||
.SS "Passing a NULL context"
|
||||
.SS "Passing a NULL context, subject, or replacement"
|
||||
.rs
|
||||
.sp
|
||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
||||
|
@ -1637,7 +1650,12 @@ Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
|||
If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
|
||||
\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
|
||||
modifiers.
|
||||
.P
|
||||
Similarly, for testing purposes, if the \fBnull_subject\fP or
|
||||
\fBnull_replacement\fP modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
.
|
||||
.
|
||||
.SH "THE ALTERNATIVE MATCHING FUNCTION"
|
||||
|
@ -2103,6 +2121,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -44,7 +44,7 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
|
|||
output.
|
||||
|
||||
In the rest of this document, the names of library functions and struc-
|
||||
tures are given in generic form, for example, pcre_compile(). The ac-
|
||||
tures are given in generic form, for example, pcre2_compile(). The ac-
|
||||
tual names used in the libraries have a suffix _8, _16, or _32, as ap-
|
||||
propriate.
|
||||
|
||||
|
@ -197,7 +197,17 @@ COMMAND LINE OPTIONS
|
|||
|
||||
-LM List modifiers: write a list of available pattern and subject
|
||||
modifiers to the standard output, then exit with zero exit
|
||||
code. All other options are ignored. If both -C and -LM are
|
||||
code. All other options are ignored. If both -C and any -Lx
|
||||
options are present, whichever is first is recognized.
|
||||
|
||||
-LP List properties: write a list of recognized Unicode proper-
|
||||
ties to the standard output, then exit with zero exit code.
|
||||
All other options are ignored. If both -C and any -Lx options
|
||||
are present, whichever is first is recognized.
|
||||
|
||||
-LS List scripts: write a list of recognized Unicode script names
|
||||
to the standard output, then exit with zero exit code. All
|
||||
other options are ignored. If both -C and any -Lx options are
|
||||
present, whichever is first is recognized.
|
||||
|
||||
-pattern modifier-list
|
||||
|
@ -1101,7 +1111,8 @@ SUBJECT MODIFIERS
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1111,6 +1122,8 @@ SUBJECT MODIFIERS
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1399,7 +1412,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||
priate limits in the match context. These values are ignored when the
|
||||
find_limits modifier is specified.
|
||||
find_limits or find_limits_noheap modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
|
@ -1407,8 +1420,12 @@ SUBJECT MODIFIERS
|
|||
calls the relevant matching function several times, setting different
|
||||
values in the match context via pcre2_set_heap_limit(),
|
||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||
minimum values for each parameter that allows the match to complete
|
||||
without error. If JIT is being used, only the match limit is relevant.
|
||||
smallest value for each parameter that allows the match to complete
|
||||
without a "limit exceeded" error. The match itself may succeed or fail.
|
||||
An alternative modifier, find_limits_noheap, omits the heap limit. This
|
||||
is used in the standard tests, because the minimum heap limit varies
|
||||
between systems. If JIT is being used, only the match limit is rele-
|
||||
vant, and the other two are automatically omitted.
|
||||
|
||||
When using this modifier, the pattern should not contain any limit set-
|
||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||
|
@ -1434,9 +1451,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
For both kinds of matching, the heap_limit number, which is in
|
||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||
for matching. A value of zero disables the use of any heap memory; many
|
||||
simple pattern matches can be done without using the heap, so zero is
|
||||
not an unreasonable setting.
|
||||
for matching.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
@ -1451,13 +1466,11 @@ SUBJECT MODIFIERS
|
|||
|
||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||
ory allocation and freeing calls that occur during a call to
|
||||
pcre2_match() or pcre2_dfa_match(). These occur only when a match re-
|
||||
quires a bigger vector than the default for remembering backtracking
|
||||
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()).
|
||||
In many cases there will be no heap memory used and therefore no addi-
|
||||
tional output. No heap memory is allocated during matching with JIT, so
|
||||
in that case the memory modifier never has any effect. For this modi-
|
||||
fier to work, the null_context modifier must not be set on both the
|
||||
pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
|
||||
used only when a match requires more internal workspace than the de-
|
||||
fault allocation on the stack, so in many cases there will be no out-
|
||||
put. No heap memory is allocated during matching with JIT. For this
|
||||
modifier to work, the null_context modifier must not be set on both the
|
||||
pattern and the subject, though it can be set on one or the other.
|
||||
|
||||
Setting a starting offset
|
||||
|
@ -1499,48 +1512,53 @@ SUBJECT MODIFIERS
|
|||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_match(),
|
||||
pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the
|
||||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly
|
||||
in this case (they use default values). This modifier cannot be used
|
||||
with the find_limits or substitute_callout modifiers.
|
||||
with the find_limits, find_limits_noheap, or substitute_callout modi-
|
||||
fiers.
|
||||
|
||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||
ment modifier is set, the subject or replacement string pointers are
|
||||
passed as NULL, respectively, to the relevant functions.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
By default, pcre2test uses the standard PCRE2 matching function,
|
||||
By default, pcre2test uses the standard PCRE2 matching function,
|
||||
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||
ferent way, and has some restrictions. The differences between the two
|
||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||
ferent way, and has some restrictions. The differences between the two
|
||||
functions are described in the pcre2matching documentation.
|
||||
|
||||
If the dfa modifier is set, the alternative matching function is used.
|
||||
This function finds all possible matches at a given point in the sub-
|
||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||
after the first match is found. This is always the shortest possible
|
||||
If the dfa modifier is set, the alternative matching function is used.
|
||||
This function finds all possible matches at a given point in the sub-
|
||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||
after the first match is found. This is always the shortest possible
|
||||
match.
|
||||
|
||||
|
||||
DEFAULT OUTPUT FROM pcre2test
|
||||
|
||||
This section describes the output when the normal matching function,
|
||||
This section describes the output when the normal matching function,
|
||||
pcre2_match(), is being used.
|
||||
|
||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||
strings, starting with number 0 for the string that matched the whole
|
||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||
strings, starting with number 0 for the string that matched the whole
|
||||
pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER-
|
||||
ROR_NOMATCH, or "Partial match:" followed by the partially matching
|
||||
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
|
||||
the entire substring that was inspected during the partial match; it
|
||||
may include characters before the actual match start if a lookbehind
|
||||
ROR_NOMATCH, or "Partial match:" followed by the partially matching
|
||||
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
|
||||
the entire substring that was inspected during the partial match; it
|
||||
may include characters before the actual match start if a lookbehind
|
||||
assertion, \K, \b, or \B was involved.)
|
||||
|
||||
For any other return, pcre2test outputs the PCRE2 negative error number
|
||||
and a short descriptive phrase. If the error is a failed UTF string
|
||||
check, the code unit offset of the start of the failing character is
|
||||
and a short descriptive phrase. If the error is a failed UTF string
|
||||
check, the code unit offset of the start of the failing character is
|
||||
also output. Here is an example of an interactive pcre2test run.
|
||||
|
||||
$ pcre2test
|
||||
|
@ -1556,8 +1574,8 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
Unset capturing substrings that are not followed by one that is set are
|
||||
not shown by pcre2test unless the allcaptures modifier is specified. In
|
||||
the following example, there are two capturing substrings, but when the
|
||||
first data line is matched, the second, unset substring is not shown.
|
||||
An "internal" unset substring is shown as "<unset>", as for the second
|
||||
first data line is matched, the second, unset substring is not shown.
|
||||
An "internal" unset substring is shown as "<unset>", as for the second
|
||||
data line.
|
||||
|
||||
re> /(a)|(b)/
|
||||
|
@ -1569,11 +1587,11 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
1: <unset>
|
||||
2: b
|
||||
|
||||
If the strings contain any non-printing characters, they are output as
|
||||
\xhh escapes if the value is less than 256 and UTF mode is not set.
|
||||
If the strings contain any non-printing characters, they are output as
|
||||
\xhh escapes if the value is less than 256 and UTF mode is not set.
|
||||
Otherwise they are output as \x{hh...} escapes. See below for the defi-
|
||||
nition of non-printing characters. If the aftertext modifier is set,
|
||||
the output for substring 0 is followed by the rest of the subject
|
||||
nition of non-printing characters. If the aftertext modifier is set,
|
||||
the output for substring 0 is followed by the rest of the subject
|
||||
string, identified by "0+" like this:
|
||||
|
||||
re> /cat/aftertext
|
||||
|
@ -1593,8 +1611,8 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
0: ipp
|
||||
1: pp
|
||||
|
||||
"No match" is output only if the first match attempt fails. Here is an
|
||||
example of a failure message (the offset 4 that is specified by the
|
||||
"No match" is output only if the first match attempt fails. Here is an
|
||||
example of a failure message (the offset 4 that is specified by the
|
||||
offset modifier is past the end of the subject string):
|
||||
|
||||
re> /xyz/
|
||||
|
@ -1602,7 +1620,7 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
Error -24 (bad offset value)
|
||||
|
||||
Note that whereas patterns can be continued over several lines (a plain
|
||||
">" prompt is used for continuations), subject lines may not. However
|
||||
">" prompt is used for continuations), subject lines may not. However
|
||||
newlines can be included in a subject by means of the \n escape (or \r,
|
||||
\r\n, etc., depending on the newline sequence setting).
|
||||
|
||||
|
@ -1610,7 +1628,7 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
When the alternative matching function, pcre2_dfa_match(), is used, the
|
||||
output consists of a list of all the matches that start at the first
|
||||
output consists of a list of all the matches that start at the first
|
||||
point in the subject where there is at least one match. For example:
|
||||
|
||||
re> /(tang|tangerine|tan)/
|
||||
|
@ -1619,11 +1637,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
|||
1: tang
|
||||
2: tan
|
||||
|
||||
Using the normal matching function on this data finds only "tang". The
|
||||
longest matching string is always given first (and numbered zero). Af-
|
||||
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
|
||||
Using the normal matching function on this data finds only "tang". The
|
||||
longest matching string is always given first (and numbered zero). Af-
|
||||
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
|
||||
lowed by the partially matching substring. Note that this is the entire
|
||||
substring that was inspected during the partial match; it may include
|
||||
substring that was inspected during the partial match; it may include
|
||||
characters before the actual match start if a lookbehind assertion, \b,
|
||||
or \B was involved. (\K is not supported for DFA matching.)
|
||||
|
||||
|
@ -1639,16 +1657,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
|||
1: tan
|
||||
0: tan
|
||||
|
||||
The alternative matching function does not support substring capture,
|
||||
so the modifiers that are concerned with captured substrings are not
|
||||
The alternative matching function does not support substring capture,
|
||||
so the modifiers that are concerned with captured substrings are not
|
||||
relevant.
|
||||
|
||||
|
||||
RESTARTING AFTER A PARTIAL MATCH
|
||||
|
||||
When the alternative matching function has given the PCRE2_ERROR_PAR-
|
||||
When the alternative matching function has given the PCRE2_ERROR_PAR-
|
||||
TIAL return, indicating that the subject partially matched the pattern,
|
||||
you can restart the match with additional subject data by means of the
|
||||
you can restart the match with additional subject data by means of the
|
||||
dfa_restart modifier. For example:
|
||||
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
|
@ -1657,37 +1675,37 @@ RESTARTING AFTER A PARTIAL MATCH
|
|||
data> n05\=dfa,dfa_restart
|
||||
0: n05
|
||||
|
||||
For further information about partial matching, see the pcre2partial
|
||||
For further information about partial matching, see the pcre2partial
|
||||
documentation.
|
||||
|
||||
|
||||
CALLOUTS
|
||||
|
||||
If the pattern contains any callout requests, pcre2test's callout func-
|
||||
tion is called during matching unless callout_none is specified. This
|
||||
tion is called during matching unless callout_none is specified. This
|
||||
works with both matching functions, and with JIT, though there are some
|
||||
differences in behaviour. The output for callouts with numerical argu-
|
||||
differences in behaviour. The output for callouts with numerical argu-
|
||||
ments and those with string arguments is slightly different.
|
||||
|
||||
Callouts with numerical arguments
|
||||
|
||||
By default, the callout function displays the callout number, the start
|
||||
and current positions in the subject text at the callout time, and the
|
||||
and current positions in the subject text at the callout time, and the
|
||||
next pattern item to be tested. For example:
|
||||
|
||||
--->pqrabcdef
|
||||
0 ^ ^ \d
|
||||
|
||||
This output indicates that callout number 0 occurred for a match at-
|
||||
tempt starting at the fourth character of the subject string, when the
|
||||
pointer was at the seventh character, and when the next pattern item
|
||||
was \d. Just one circumflex is output if the start and current posi-
|
||||
This output indicates that callout number 0 occurred for a match at-
|
||||
tempt starting at the fourth character of the subject string, when the
|
||||
pointer was at the seventh character, and when the next pattern item
|
||||
was \d. Just one circumflex is output if the start and current posi-
|
||||
tions are the same, or if the current position precedes the start posi-
|
||||
tion, which can happen if the callout is in a lookbehind assertion.
|
||||
|
||||
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
||||
a result of the auto_callout pattern modifier. In this case, instead of
|
||||
showing the callout number, the offset in the pattern, preceded by a
|
||||
showing the callout number, the offset in the pattern, preceded by a
|
||||
plus, is output. For example:
|
||||
|
||||
re> /\d?[A-E]\*/auto_callout
|
||||
|
@ -1714,17 +1732,17 @@ CALLOUTS
|
|||
+12 ^ ^
|
||||
0: abc
|
||||
|
||||
The mark changes between matching "a" and "b", but stays the same for
|
||||
the rest of the match, so nothing more is output. If, as a result of
|
||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||
The mark changes between matching "a" and "b", but stays the same for
|
||||
the rest of the match, so nothing more is output. If, as a result of
|
||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||
output.
|
||||
|
||||
Callouts with string arguments
|
||||
|
||||
The output for a callout with a string argument is similar, except that
|
||||
instead of outputting a callout number before the position indicators,
|
||||
the callout string and its offset in the pattern string are output be-
|
||||
fore the reflection of the subject string, and the subject string is
|
||||
instead of outputting a callout number before the position indicators,
|
||||
the callout string and its offset in the pattern string are output be-
|
||||
fore the reflection of the subject string, and the subject string is
|
||||
reflected for each callout. For example:
|
||||
|
||||
re> /^ab(?C'first')cd(?C"second")ef/
|
||||
|
@ -1740,26 +1758,26 @@ CALLOUTS
|
|||
|
||||
Callout modifiers
|
||||
|
||||
The callout function in pcre2test returns zero (carry on matching) by
|
||||
default, but you can use a callout_fail modifier in a subject line to
|
||||
The callout function in pcre2test returns zero (carry on matching) by
|
||||
default, but you can use a callout_fail modifier in a subject line to
|
||||
change this and other parameters of the callout (see below).
|
||||
|
||||
If the callout_capture modifier is set, the current captured groups are
|
||||
output when a callout occurs. This is useful only for non-DFA matching,
|
||||
as pcre2_dfa_match() does not support capturing, so no captures are
|
||||
as pcre2_dfa_match() does not support capturing, so no captures are
|
||||
ever shown.
|
||||
|
||||
The normal callout output, showing the callout number or pattern offset
|
||||
(as described above) is suppressed if the callout_no_where modifier is
|
||||
(as described above) is suppressed if the callout_no_where modifier is
|
||||
set.
|
||||
|
||||
When using the interpretive matching function pcre2_match() without
|
||||
JIT, setting the callout_extra modifier causes additional output from
|
||||
pcre2test's callout function to be generated. For the first callout in
|
||||
a match attempt at a new starting position in the subject, "New match
|
||||
attempt" is output. If there has been a backtrack since the last call-
|
||||
When using the interpretive matching function pcre2_match() without
|
||||
JIT, setting the callout_extra modifier causes additional output from
|
||||
pcre2test's callout function to be generated. For the first callout in
|
||||
a match attempt at a new starting position in the subject, "New match
|
||||
attempt" is output. If there has been a backtrack since the last call-
|
||||
out (or start of matching if this is the first callout), "Backtrack" is
|
||||
output, followed by "No other matching paths" if the backtrack ended
|
||||
output, followed by "No other matching paths" if the backtrack ended
|
||||
the previous match attempt. For example:
|
||||
|
||||
re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
|
||||
|
@ -1796,86 +1814,86 @@ CALLOUTS
|
|||
+1 ^ a+
|
||||
No match
|
||||
|
||||
Notice that various optimizations must be turned off if you want all
|
||||
possible matching paths to be scanned. If no_start_optimize is not
|
||||
used, there is an immediate "no match", without any callouts, because
|
||||
the starting optimization fails to find "b" in the subject, which it
|
||||
knows must be present for any match. If no_auto_possess is not used,
|
||||
the "a+" item is turned into "a++", which reduces the number of back-
|
||||
Notice that various optimizations must be turned off if you want all
|
||||
possible matching paths to be scanned. If no_start_optimize is not
|
||||
used, there is an immediate "no match", without any callouts, because
|
||||
the starting optimization fails to find "b" in the subject, which it
|
||||
knows must be present for any match. If no_auto_possess is not used,
|
||||
the "a+" item is turned into "a++", which reduces the number of back-
|
||||
tracks.
|
||||
|
||||
The callout_extra modifier has no effect if used with the DFA matching
|
||||
The callout_extra modifier has no effect if used with the DFA matching
|
||||
function, or with JIT.
|
||||
|
||||
Return values from callouts
|
||||
|
||||
The default return from the callout function is zero, which allows
|
||||
The default return from the callout function is zero, which allows
|
||||
matching to continue. The callout_fail modifier can be given one or two
|
||||
numbers. If there is only one number, 1 is returned instead of 0 (caus-
|
||||
ing matching to backtrack) when a callout of that number is reached. If
|
||||
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
|
||||
reached and there have been at least <m> callouts. The callout_error
|
||||
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
|
||||
reached and there have been at least <m> callouts. The callout_error
|
||||
modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
|
||||
ing the entire matching process to be aborted. If both these modifiers
|
||||
are set for the same callout number, callout_error takes precedence.
|
||||
Note that callouts with string arguments are always given the number
|
||||
ing the entire matching process to be aborted. If both these modifiers
|
||||
are set for the same callout number, callout_error takes precedence.
|
||||
Note that callouts with string arguments are always given the number
|
||||
zero.
|
||||
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
function.
|
||||
|
||||
Inserting callouts can be helpful when using pcre2test to check compli-
|
||||
cated regular expressions. For further information about callouts, see
|
||||
cated regular expressions. For further information about callouts, see
|
||||
the pcre2callout documentation.
|
||||
|
||||
|
||||
NON-PRINTING CHARACTERS
|
||||
|
||||
When pcre2test is outputting text in the compiled version of a pattern,
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
and are therefore shown as hex escapes.
|
||||
|
||||
When pcre2test is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the locale modifier). In this case, the is-
|
||||
When pcre2test is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the locale modifier). In this case, the is-
|
||||
print() function is used to distinguish printing and non-printing char-
|
||||
acters.
|
||||
|
||||
|
||||
SAVING AND RESTORING COMPILED PATTERNS
|
||||
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
reload them later, subject to a number of restrictions. JIT data cannot
|
||||
be saved. The host on which the patterns are reloaded must be running
|
||||
be saved. The host on which the patterns are reloaded must be running
|
||||
the same version of PCRE2, with the same code unit width, and must also
|
||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||
compiled patterns can be saved they must be serialized, that is, con-
|
||||
verted to a stream of bytes. A single byte stream may contain any num-
|
||||
ber of compiled patterns, but they must all use the same character ta-
|
||||
bles. A single copy of the tables is included in the byte stream (its
|
||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||
compiled patterns can be saved they must be serialized, that is, con-
|
||||
verted to a stream of bytes. A single byte stream may contain any num-
|
||||
ber of compiled patterns, but they must all use the same character ta-
|
||||
bles. A single copy of the tables is included in the byte stream (its
|
||||
size is 1088 bytes).
|
||||
|
||||
The functions whose names begin with pcre2_serialize_ are used for se-
|
||||
rializing and de-serializing. They are described in the pcre2serialize
|
||||
documentation. In this section we describe the features of pcre2test
|
||||
The functions whose names begin with pcre2_serialize_ are used for se-
|
||||
rializing and de-serializing. They are described in the pcre2serialize
|
||||
documentation. In this section we describe the features of pcre2test
|
||||
that can be used to test these functions.
|
||||
|
||||
Note that "serialization" in PCRE2 does not convert compiled patterns
|
||||
to an abstract format like Java or .NET. It just makes a reloadable
|
||||
Note that "serialization" in PCRE2 does not convert compiled patterns
|
||||
to an abstract format like Java or .NET. It just makes a reloadable
|
||||
byte code stream. Hence the restrictions on reloading mentioned above.
|
||||
|
||||
In pcre2test, when a pattern with push modifier is successfully com-
|
||||
piled, it is pushed onto a stack of compiled patterns, and pcre2test
|
||||
expects the next line to contain a new pattern (or command) instead of
|
||||
In pcre2test, when a pattern with push modifier is successfully com-
|
||||
piled, it is pushed onto a stack of compiled patterns, and pcre2test
|
||||
expects the next line to contain a new pattern (or command) instead of
|
||||
a subject line. By contrast, the pushcopy modifier causes a copy of the
|
||||
compiled pattern to be stacked, leaving the original available for im-
|
||||
mediate matching. By using push and/or pushcopy, a number of patterns
|
||||
can be compiled and retained. These modifiers are incompatible with
|
||||
compiled pattern to be stacked, leaving the original available for im-
|
||||
mediate matching. By using push and/or pushcopy, a number of patterns
|
||||
can be compiled and retained. These modifiers are incompatible with
|
||||
posix, and control modifiers that act at match time are ignored (with a
|
||||
message) for the stacked patterns. The jitverify modifier applies only
|
||||
message) for the stacked patterns. The jitverify modifier applies only
|
||||
at compile time.
|
||||
|
||||
The command
|
||||
|
@ -1883,21 +1901,21 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
#save <filename>
|
||||
|
||||
causes all the stacked patterns to be serialized and the result written
|
||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||
command
|
||||
|
||||
#load <filename>
|
||||
|
||||
reads the data in the file, and then arranges for it to be de-serial-
|
||||
ized, with the resulting compiled patterns added to the pattern stack.
|
||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||
mand, which must be followed by lines of subjects that are to be
|
||||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, posix_nosub, push, and pushcopy are not al-
|
||||
lowed, nor are any option-setting modifiers. The JIT modifiers are,
|
||||
however, permitted. Here is an example that saves and reloads two pat-
|
||||
reads the data in the file, and then arranges for it to be de-serial-
|
||||
ized, with the resulting compiled patterns added to the pattern stack.
|
||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||
mand, which must be followed by lines of subjects that are to be
|
||||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, posix_nosub, push, and pushcopy are not al-
|
||||
lowed, nor are any option-setting modifiers. The JIT modifiers are,
|
||||
however, permitted. Here is an example that saves and reloads two pat-
|
||||
terns.
|
||||
|
||||
/abc/push
|
||||
|
@ -1910,10 +1928,10 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
#pop jit,bincode
|
||||
abc
|
||||
|
||||
If jitverify is used with #pop, it does not automatically imply jit,
|
||||
If jitverify is used with #pop, it does not automatically imply jit,
|
||||
which is different behaviour from when it is used on a pattern.
|
||||
|
||||
The #popcopy command is analogous to the pushcopy modifier in that it
|
||||
The #popcopy command is analogous to the pushcopy modifier in that it
|
||||
makes current a copy of the topmost stack pattern, leaving the original
|
||||
still on the stack.
|
||||
|
||||
|
@ -1933,5 +1951,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "23 February 2020" "PCRE2 10.35"
|
||||
.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -40,10 +40,11 @@ handled, as documented below.
|
|||
.sp
|
||||
When PCRE2 is built with Unicode support, the escape sequences \ep{..},
|
||||
\eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -51,10 +52,10 @@ and
|
|||
.\" HREF
|
||||
\fBpcre2syntax\fP
|
||||
.\"
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
.
|
||||
.
|
||||
.SH "WIDE CHARACTERS AND UTF MODES"
|
||||
|
@ -448,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -457,6 +458,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 February 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
Last updated: 22 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
8
index.md
8
index.md
|
@ -14,14 +14,14 @@ flexible API, the code of PCRE2 has been much improved since the fork.
|
|||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PhilipHazel/pcre2.git
|
||||
svn co https://github.com/PhilipHazel/pcre2.git
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
|
@ -36,7 +36,7 @@ default character encoding, can be found at
|
|||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
|
|
@ -0,0 +1,355 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This file is a Python module containing common lists and functions for the
|
||||
# GenerateXXX scripts that create various .c and .h files from Unicode data
|
||||
# files. It was created as part of a re-organization of these scripts in
|
||||
# December 2021.
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DATA LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
||||
|
||||
bidi_classes = [
|
||||
'AL', 'Arabic letter',
|
||||
'AN', 'Arabic number',
|
||||
'B', 'Paragraph separator',
|
||||
'BN', 'Boundary neutral',
|
||||
'CS', 'Common separator',
|
||||
'EN', 'European number',
|
||||
'ES', 'European separator',
|
||||
'ET', 'European terminator',
|
||||
'FSI', 'First strong isolate',
|
||||
'L', 'Left to right',
|
||||
'LRE', 'Left to right embedding',
|
||||
'LRI', 'Left to right isolate',
|
||||
'LRO', 'Left to right override',
|
||||
'NSM', 'Non-spacing mark',
|
||||
'ON', 'Other neutral',
|
||||
'PDF', 'Pop directional format',
|
||||
'PDI', 'Pop directional isolate',
|
||||
'R', 'Right to left',
|
||||
'RLE', 'Right to left embedding',
|
||||
'RLI', 'Right to left isolate',
|
||||
'RLO', 'Right to left override',
|
||||
'S', 'Segment separator',
|
||||
'WS', 'White space'
|
||||
]
|
||||
|
||||
# Particular category property names, with comments. NOTE: If ever this list
|
||||
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
|
||||
# must be edited to keep in step.
|
||||
|
||||
category_names = [
|
||||
'Cc', 'Control',
|
||||
'Cf', 'Format',
|
||||
'Cn', 'Unassigned',
|
||||
'Co', 'Private use',
|
||||
'Cs', 'Surrogate',
|
||||
'Ll', 'Lower case letter',
|
||||
'Lm', 'Modifier letter',
|
||||
'Lo', 'Other letter',
|
||||
'Lt', 'Title case letter',
|
||||
'Lu', 'Upper case letter',
|
||||
'Mc', 'Spacing mark',
|
||||
'Me', 'Enclosing mark',
|
||||
'Mn', 'Non-spacing mark',
|
||||
'Nd', 'Decimal number',
|
||||
'Nl', 'Letter number',
|
||||
'No', 'Other number',
|
||||
'Pc', 'Connector punctuation',
|
||||
'Pd', 'Dash punctuation',
|
||||
'Pe', 'Close punctuation',
|
||||
'Pf', 'Final punctuation',
|
||||
'Pi', 'Initial punctuation',
|
||||
'Po', 'Other punctuation',
|
||||
'Ps', 'Open punctuation',
|
||||
'Sc', 'Currency symbol',
|
||||
'Sk', 'Modifier symbol',
|
||||
'Sm', 'Mathematical symbol',
|
||||
'So', 'Other symbol',
|
||||
'Zl', 'Line separator',
|
||||
'Zp', 'Paragraph separator',
|
||||
'Zs', 'Space separator'
|
||||
]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_properties = [
|
||||
'CR', ' 0',
|
||||
'LF', ' 1',
|
||||
'Control', ' 2',
|
||||
'Extend', ' 3',
|
||||
'Prepend', ' 4',
|
||||
'SpacingMark', ' 5',
|
||||
'L', ' 6 Hangul syllable type L',
|
||||
'V', ' 7 Hangul syllable type V',
|
||||
'T', ' 8 Hangul syllable type T',
|
||||
'LV', ' 9 Hangul syllable type LV',
|
||||
'LVT', '10 Hangul syllable type LVT',
|
||||
'Regional_Indicator', '11',
|
||||
'Other', '12',
|
||||
'ZWJ', '13',
|
||||
'Extended_Pictographic', '14'
|
||||
]
|
||||
|
||||
# List of files from which the names of Boolean properties are obtained, along
|
||||
# with a list of regex patterns for properties to be ignored, and a list of
|
||||
# extra pattern names to add.
|
||||
|
||||
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
|
||||
bool_propsignore = [r'^Other_', r'^Hyphen$']
|
||||
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GET BOOLEAN PROPERTY NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Get a list of Boolean property names from a number of files.
|
||||
|
||||
def getbpropslist():
|
||||
bplist = []
|
||||
bplast = ""
|
||||
|
||||
for filename in bool_propsfiles:
|
||||
try:
|
||||
file = open('Unicode.tables/' + filename, 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1 or data[1] == bplast:
|
||||
continue
|
||||
bplast = data[1]
|
||||
for pat in bool_propsignore:
|
||||
if re.match(pat, bplast) != None:
|
||||
break
|
||||
else:
|
||||
bplist.append(bplast)
|
||||
|
||||
file.close()
|
||||
|
||||
bplist.extend(bool_propsextras)
|
||||
bplist.sort()
|
||||
return bplist
|
||||
|
||||
bool_properties = getbpropslist()
|
||||
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# COLLECTING PROPERTY NAMES AND ALIASES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_names = ['Unknown']
|
||||
abbreviations = {}
|
||||
|
||||
def collect_property_names():
|
||||
global script_names
|
||||
global abbreviations
|
||||
|
||||
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
|
||||
|
||||
last_script_name = ""
|
||||
with open("Unicode.tables/Scripts.txt") as f:
|
||||
for line in f:
|
||||
match_obj = names_re.match(line)
|
||||
|
||||
if match_obj == None or match_obj.group(1) == last_script_name:
|
||||
continue
|
||||
|
||||
last_script_name = match_obj.group(1)
|
||||
script_names.append(last_script_name)
|
||||
|
||||
# Sometimes there is a comment in the line
|
||||
# so splitting around semicolon is not enough
|
||||
value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
|
||||
|
||||
with open("Unicode.tables/PropertyValueAliases.txt") as f:
|
||||
for line in f:
|
||||
match_obj = value_alias_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
if match_obj.group(1) == "sc":
|
||||
if match_obj.group(2) == match_obj.group(3):
|
||||
abbreviations[match_obj.group(3)] = ()
|
||||
elif match_obj.group(4) == None:
|
||||
abbreviations[match_obj.group(3)] = (match_obj.group(2),)
|
||||
else:
|
||||
abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
|
||||
|
||||
# We can also collect Boolean property abbreviations into the same dictionary
|
||||
|
||||
bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
|
||||
with open("Unicode.tables/PropertyAliases.txt") as f:
|
||||
for line in f:
|
||||
match_obj = bin_alias_re.match(line)
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
if match_obj.group(2) in bool_properties:
|
||||
if match_obj.group(3) == None:
|
||||
abbreviations[match_obj.group(2)] = (match_obj.group(1),)
|
||||
else:
|
||||
abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))
|
||||
|
||||
collect_property_names()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# REORDERING SCRIPT NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_abbrevs = []
|
||||
|
||||
def reorder_scripts():
|
||||
global script_names
|
||||
global script_abbrevs
|
||||
global abbreviations
|
||||
|
||||
for name in script_names:
|
||||
abbrevs = abbreviations[name]
|
||||
script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
|
||||
|
||||
extended_script_abbrevs = set()
|
||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
|
||||
|
||||
for line in f:
|
||||
match_obj = names_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
for name in match_obj.group(1).split(" "):
|
||||
extended_script_abbrevs.add(name)
|
||||
|
||||
new_script_names = []
|
||||
new_script_abbrevs = []
|
||||
|
||||
for idx, abbrev in enumerate(script_abbrevs):
|
||||
if abbrev in extended_script_abbrevs:
|
||||
new_script_names.append(script_names[idx])
|
||||
new_script_abbrevs.append(abbrev)
|
||||
|
||||
for idx, abbrev in enumerate(script_abbrevs):
|
||||
if abbrev not in extended_script_abbrevs:
|
||||
new_script_names.append(script_names[idx])
|
||||
new_script_abbrevs.append(abbrev)
|
||||
|
||||
script_names = new_script_names
|
||||
script_abbrevs = new_script_abbrevs
|
||||
|
||||
reorder_scripts()
|
||||
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DERIVED LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Create general character property names from the first letters of the
|
||||
# particular categories.
|
||||
|
||||
gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
|
||||
general_category_names = list(gcn_set)
|
||||
general_category_names.sort()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
|
||||
# Open an output file, using the command's argument or a default. Write common
|
||||
# preliminary header information.
|
||||
|
||||
def open_output(default):
|
||||
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a file name')
|
||||
sys.exit(1)
|
||||
if len(sys.argv) == 2:
|
||||
output_name = sys.argv[1]
|
||||
else:
|
||||
output_name = default
|
||||
try:
|
||||
file = open(output_name, "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open %s" % output_name)
|
||||
sys.exit(1)
|
||||
|
||||
script_name = sys.argv[0]
|
||||
i = script_name.rfind('/')
|
||||
if i >= 0:
|
||||
script_name = script_name[i+1:]
|
||||
|
||||
file.write("""\
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
|
||||
""")
|
||||
|
||||
file.write("Instead, modify the maint/%s script and run it to generate\n"
|
||||
"a new version of this code.\n\n" % script_name)
|
||||
|
||||
file.write("""\
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
\n""")
|
||||
return file
|
||||
|
||||
# End of UcpCommon.py
|
|
@ -0,0 +1,188 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This file auto-generates unicode property tests and their expected output.
|
||||
# It is recommended to re-run this generator after the unicode files are
|
||||
# updated. The names of the generated files are `testinput26` and `testoutput26`
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from GenerateCommon import \
|
||||
script_names, \
|
||||
script_abbrevs
|
||||
|
||||
def write_both(text):
|
||||
input_file.write(text)
|
||||
output_file.write(text)
|
||||
|
||||
def to_string_char(ch_idx):
|
||||
if ch_idx < 128:
|
||||
if ch_idx < 16:
|
||||
return "\\x{0%x}" % ch_idx
|
||||
if ch_idx >= 32:
|
||||
return chr(ch_idx)
|
||||
return "\\x{%x}" % ch_idx
|
||||
|
||||
output_directory = ""
|
||||
|
||||
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a directory name')
|
||||
sys.exit(1)
|
||||
if len(sys.argv) == 2:
|
||||
output_directory = sys.argv[1]
|
||||
if not output_directory.endswith("/"):
|
||||
output_directory += "/"
|
||||
|
||||
try:
|
||||
input_file = open(output_directory + "testinput26", "w")
|
||||
output_file = open(output_directory + "testoutput26", "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open output files")
|
||||
sys.exit(1)
|
||||
|
||||
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UNICODE SCRIPT EXTENSION TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
write_both("# Unicode Script Extension tests.\n\n")
|
||||
|
||||
def gen_script_tests():
|
||||
script_data = [None] * len(script_names)
|
||||
char_data = [None] * 0x110000
|
||||
|
||||
property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
|
||||
prev_name = ""
|
||||
script_idx = -1
|
||||
|
||||
with open("Unicode.tables/Scripts.txt") as f:
|
||||
for line in f:
|
||||
match_obj = property_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
name = match_obj.group(3)
|
||||
if name != prev_name:
|
||||
script_idx = script_names.index(name)
|
||||
prev_name = name
|
||||
|
||||
low = int(match_obj.group(1), 16)
|
||||
high = low
|
||||
char_data[low] = name
|
||||
|
||||
if match_obj.group(2) != None:
|
||||
high = int(match_obj.group(2), 16)
|
||||
for idx in range(low + 1, high + 1):
|
||||
char_data[idx] = name
|
||||
|
||||
if script_data[script_idx] == None:
|
||||
script_data[script_idx] = [low, None, None, None, None]
|
||||
script_data[script_idx][1] = high
|
||||
|
||||
extended_script_indicies = {}
|
||||
|
||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||
for line in f:
|
||||
match_obj = property_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
low = int(match_obj.group(1), 16)
|
||||
high = low
|
||||
if match_obj.group(2) != None:
|
||||
high = int(match_obj.group(2), 16)
|
||||
|
||||
for abbrev in match_obj.group(3).split(" "):
|
||||
if abbrev not in extended_script_indicies:
|
||||
idx = script_abbrevs.index(abbrev)
|
||||
extended_script_indicies[abbrev] = idx
|
||||
rec = script_data[idx]
|
||||
rec[2] = low
|
||||
rec[3] = high
|
||||
else:
|
||||
idx = extended_script_indicies[abbrev]
|
||||
rec = script_data[idx]
|
||||
if rec[2] > low:
|
||||
rec[2] = low
|
||||
if rec[3] < high:
|
||||
rec[3] = high
|
||||
|
||||
if rec[4] == None:
|
||||
name = script_names[idx]
|
||||
for idx in range(low, high + 1):
|
||||
if char_data[idx] != name:
|
||||
rec[4] = idx
|
||||
break
|
||||
|
||||
long_property_name = False
|
||||
|
||||
for idx, rec in enumerate(script_data):
|
||||
script_name = script_names[idx]
|
||||
|
||||
if script_name == "Unknown":
|
||||
continue
|
||||
|
||||
script_abbrev = script_abbrevs[idx]
|
||||
|
||||
write_both("# Base script check\n")
|
||||
write_both("/^\\p{sc=%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[0]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[0]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
|
||||
write_both(" %s\n" % to_string_char(rec[1]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[1]))
|
||||
write_both("\n")
|
||||
|
||||
if rec[2] != None:
|
||||
property_name = "scx"
|
||||
if long_property_name:
|
||||
property_name = "Script_Extensions"
|
||||
|
||||
write_both("# Script extension check\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[2]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[2]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
|
||||
write_both(" %s\n" % to_string_char(rec[3]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[3]))
|
||||
write_both("\n")
|
||||
|
||||
long_property_name = not long_property_name
|
||||
|
||||
if rec[4] != None:
|
||||
write_both("# Script extension only character\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[4]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[4]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{sc=%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[4]))
|
||||
output_file.write("No match\n")
|
||||
write_both("\n")
|
||||
else:
|
||||
print("External character has not found for %s" % script_name)
|
||||
|
||||
high = rec[1]
|
||||
if rec[3] != None and rec[3] > rec[1]:
|
||||
high = rec[3]
|
||||
write_both("# Character not in script\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(high + 1))
|
||||
output_file.write("No match\n")
|
||||
write_both("\n")
|
||||
|
||||
|
||||
gen_script_tests()
|
||||
|
||||
write_both("# End of testinput26\n")
|
|
@ -0,0 +1,923 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This script generates the pcre2_ucd.c file from Unicode data files. This is
|
||||
# the compressed Unicode property data used by PCRE2. The script was created in
|
||||
# December 2021 as part of the Unicode data generation refactoring. It is
|
||||
# basically a re-working of the MultiStage2.py script that was submitted to the
|
||||
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
|
||||
# Unicode property support. A number of extensions have since been added. The
|
||||
# main difference in the 2021 upgrade (apart from comments and layout) is that
|
||||
# the data tables (e.g. list of script names) are now listed in or generated by
|
||||
# a separate Python module that is shared with the other Generate scripts.
|
||||
#
|
||||
# This script must be run in the "maint" directory. It requires the following
|
||||
# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
|
||||
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
|
||||
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
|
||||
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
|
||||
# subdirectory of the Unicode database (UCD) on the Unicode web site;
|
||||
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
|
||||
# are in the top-level UCD directory.
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to the original script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to the original script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# Added code to add a bidi class field to records by scanning the
|
||||
# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
|
||||
# bytes, so now 11 out of 12 are in use.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# PCRE2-10.39: Updated for Unicode 14.0.0
|
||||
# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
|
||||
# and also PropList.txt for the Bidi_Control property
|
||||
# 19-December-2021: Reworked script extensions lists to be bit maps instead
|
||||
# of zero-terminated lists of script numbers.
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# Changes to the refactored script:
|
||||
#
|
||||
# 26-December-2021: Refactoring completed
|
||||
# 10-January-2022: Addition of general Boolean property support
|
||||
# 12-January-2022: Merge scriptx and bidiclass fields
|
||||
# 14-January-2022: Enlarge Boolean property offset to 12 bits
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contain no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), one for each
|
||||
# Unicode character. Each record contains the script number, script extension
|
||||
# value, character type, grapheme break type, offset to caseless matching set,
|
||||
# offset to the character's other case, the bidi class, and offset to bitmap of
|
||||
# Boolean properties.
|
||||
#
|
||||
# A real table covering all Unicode characters would be far too big. It can be
|
||||
# efficiently compressed by observing that many characters have the same
|
||||
# record, and many blocks of characters (taking 128 characters in a block) have
|
||||
# the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
# process.
|
||||
#
|
||||
# This script constructs seven tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# Scripts are partitioned into two groups. Scripts that appear in at least one
|
||||
# character's script extension list come first, followed by "Unknown" and then
|
||||
# all the rest. This sorting is done automatically in the GenerateCommon.py
|
||||
# script. A script's number is its index in the script_names list.
|
||||
#
|
||||
# The ucd_script_sets table contains bitmaps that represent lists of scripts
|
||||
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
||||
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
||||
# used in any character's extension list, that is, enough for every script
|
||||
# whose number is less than ucp_Unknown. A character's script extension value
|
||||
# in its ucd record is an offset into the ucd_script_sets vector. The first
|
||||
# bitmap has no bits set; characters that have no script extensions have zero
|
||||
# as their script extensions value so that they use this map.
|
||||
#
|
||||
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
|
||||
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
|
||||
# numbers, enough to allocate a bit for each supported Boolean property.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique character record
|
||||
# that is required. The ucd_stage1 table is indexed by a character's block
|
||||
# number, which is the character's code point divided by 128, since 128 is the
|
||||
# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
|
||||
# number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 14.0.0 database. Future
|
||||
# updates may change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 35
|
||||
# record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
|
||||
# 0 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 44 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 93
|
||||
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
||||
# record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
|
||||
# 20 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 82 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
||||
# record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
|
||||
# 84 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 26762 = 0x688A => Combined Bidi class + script extension values
|
||||
# 96 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||
# 138 => Script Extension list offset = 138
|
||||
#
|
||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||
# 18, and 47 set. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||
#
|
||||
# Philip Hazel, last updated 14 January 2022.
|
||||
##############################################################################
|
||||
|
||||
|
||||
# Import standard modules
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_propsfiles, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_abbrevs, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Some general parameters
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DEFINE FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
|
||||
# or DerivedGeneralCategory.txt
|
||||
|
||||
def make_get_names(enum):
    """Build a line parser that maps a property name to its position in enum.

    The returned callable takes the list of ';'-separated fields of one data
    line and returns enum.index(fields[1]). list.index raises ValueError when
    the name is not present in enum.
    """
    def lookup_name(chardata):
        return enum.index(chardata[1])

    return lookup_name
|
||||
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
|
||||
def get_other_case(chardata):
    """Return the signed distance from a code point to its case-folded partner.

    chardata holds the fields of one CaseFolding.txt line: code point (hex),
    status, and mapping (hex). Only lines whose status is 'C' or 'S' produce
    an offset (mapping - code point); any other status yields 0.
    """
    if chardata[1] not in ('C', 'S'):
        return 0
    return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
|
||||
def get_script_extension(chardata):
    """Return the script-list offset for one ScriptExtensions.txt line.

    chardata[1] is a space-separated list of script abbreviations. Each new
    list is appended to the module-level script_lists as a tuple of indices
    into script_abbrevs, and its offset (position * script_list_item_size) is
    returned. When the list text equals that of the previous call, the offset
    of the most recently appended entry is returned instead of appending a
    duplicate. Only the immediately preceding line is compared.
    """
    global last_script_extension

    next_offset = len(script_lists) * script_list_item_size
    if chardata[1] == last_script_extension:
        return next_offset - script_list_item_size

    last_script_extension = chardata[1]
    script_numbers = tuple(
        script_abbrevs.index(abbrev) for abbrev in chardata[1].split(' '))
    script_lists.append(script_numbers)
    return next_offset
|
||||
|
||||
|
||||
# Read a whole table in memory, setting/checking the Unicode version
|
||||
|
||||
def read_table(file_name, get_value, default_value):
    """Read one Unicode data file into a table indexed by code point.

    file_name     path of the form "<dir>/<base>.txt"; the first line of the
                  file must be "# <base>-<x.y.z>.txt", which supplies the
                  Unicode version
    get_value     callable that receives the list of stripped ';'-separated
                  fields of a data line and returns the value to store
    default_value value for code points that the file does not mention

    Returns a list of MAX_UNICODE values. The first file read sets the global
    unicode_version; a later file with a different version produces a warning
    on stderr. An AttributeError results if the file name or the first line
    does not have the expected form, because the regex match is None.
    """
    global unicode_version

    f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
    file_base = f.group(1)
    version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"

    # The with statement closes the file even if a line fails to parse.
    with open(file_name, 'r', encoding='utf-8') as file:
        f = re.match(version_pat, file.readline())
        version = f.group(1)
        if unicode_version == "":
            unicode_version = version
        elif unicode_version != version:
            # The file name must be interpolated with %; passing it as a
            # second print() argument would output a literal "%s".
            print("WARNING: Unicode version differs in %s" % file_name,
                  file=sys.stderr)

        table = [default_value] * MAX_UNICODE
        for line in file:
            line = re.sub(r'#.*', '', line)
            chardata = list(map(str.strip, line.split(';')))
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)
            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)
            for i in range(char, last + 1):
                # It is important not to overwrite a previously set value
                # because in the CaseFolding file there are lines to be
                # ignored (returning the default value of 0) which often come
                # after a line which has already set data.
                if table[i] == default_value:
                    table[i] = value

    return table
|
||||
|
||||
|
||||
# Get the smallest possible C language type for the values in a table
|
||||
|
||||
def get_type_size(table):
    """Pick the first C integer type that can hold every value in table.

    Candidates are tried in this order: uint8_t, uint16_t, uint32_t,
    signed char, int16_t, int32_t. Returns a (type name, size in bytes)
    tuple. Raises OverflowError when no candidate covers the range from
    min(table) to max(table).
    """
    candidates = (
        ("uint8_t", 1, 0, 255),
        ("uint16_t", 2, 0, 65535),
        ("uint32_t", 4, 0, 4294967295),
        ("signed char", 1, -128, 127),
        ("int16_t", 2, -32768, 32767),
        ("int32_t", 4, -2147483648, 2147483647),
    )
    lowest = min(table)
    highest = max(table)
    for c_type, width, lower, upper in candidates:
        if lower <= lowest and highest <= upper:
            return (c_type, width)
    raise OverflowError("Too large to fit into C types")
|
||||
|
||||
|
||||
# Get the total size of a list of tables
|
||||
|
||||
def get_tables_size(*tables):
    """Return the total byte size of the given tables.

    Each table contributes len(table) times the element size that
    get_type_size() selects for it. With no tables the result is 0.
    """
    return sum(get_type_size(table)[1] * len(table) for table in tables)
|
||||
|
||||
|
||||
# Compress a table into the two stages
|
||||
|
||||
def compress_table(table, block_size):
    """Split table into blocks of block_size and share identical blocks.

    Returns (stage1, stage2). stage2 is the concatenation of the distinct
    blocks in order of first appearance; stage1 has one entry per block of
    the input, giving the number of that block within stage2.
    """
    blocks = {} # Dictionary for finding identical blocks
    stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = [] # Stage 2 table contains the blocks with property values
    table = tuple(table)
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Floor division keeps block numbers as
            # integers; true division would store floats under Python 3.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)
    return stage1, stage2
|
||||
|
||||
|
||||
# Output a table
|
||||
|
||||
def write_table(table, table_name, block_size = None):
    """Write table to the output file f as a C array definition.

    The element type comes from get_type_size(). Without block_size the
    values are written 16 per line, each line followed by a U+XXXX comment
    computed from the line's first index scaled by MAX_UNICODE / len(table).
    With block_size, each block is preceded by a "block N" comment and split
    into lines of at most 16 values.
    """
    c_type, elem_size = get_type_size(table)
    ELEMS_PER_LINE = 16

    header = "const %s %s[] = { /* %d bytes" % (c_type, table_name, elem_size * len(table))
    if block_size:
        header += ", block = %d" % block_size
    f.write(header + " */\n")

    values = tuple(table)
    if block_size is None:
        row_fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
        mult = MAX_UNICODE / len(values)
        for start in range(0, len(values), ELEMS_PER_LINE):
            row = values[start:start+ELEMS_PER_LINE]
            f.write(row_fmt % (row + (int(start * mult),)))
    else:
        per_line = min(block_size, ELEMS_PER_LINE)
        block_fmt = "%3d," * per_line + "\n"
        if block_size > ELEMS_PER_LINE:
            block_fmt = block_fmt * int(block_size / ELEMS_PER_LINE)
        block_fmt = "\n/* block %d */\n" + block_fmt
        for start in range(0, len(values), block_size):
            f.write(block_fmt % ((start / block_size,) + values[start:start+block_size]))

    f.write("};\n\n")
|
||||
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
|
||||
def combine_tables(*tables):
    """Merge parallel property tables into unique records.

    The tables are zipped position by position; each distinct tuple of
    values becomes a record numbered in order of first appearance.

    Returns (index, records): index[i] is the record number for position i,
    and records maps each value tuple to its record number.
    """
    records = {}
    index = []
    for row in zip(*tables):
        if row not in records:
            records[row] = len(records)
        index.append(records[row])
    return index, records
|
||||
|
||||
|
||||
# Create a record struct
|
||||
|
||||
def get_record_size_struct(records):
    """Compute the C struct size and a typedef text for the record tuples.

    records is a sequence of equal-length tuples. For each field position
    the type is chosen by get_type_size() over that field's values across
    all records. Fields are laid out in order, each aligned to its own size,
    and the total is finally rounded up to the alignment of the first field.

    Returns (size, structure). The structure text ends with "*/" because it
    closes a C comment.
    """
    def align_up(offset, alignment):
        # Round offset up to the next multiple of alignment (a power of two).
        return (offset + alignment - 1) & -alignment

    size = 0
    fields = []
    for i in range(len(records[0])):
        slice_type, slice_size = get_type_size([record[i] for record in records])
        size = align_up(size, slice_size) + slice_size
        fields.append('%s property_%d;\n' % (slice_type, i))

    # Round up to where the first item of the next structure in an array
    # would start.
    first_type, first_size = get_type_size([record[0] for record in records])
    size = align_up(size, first_size)

    structure = 'typedef struct {\n' + ''.join(fields) + '} ucd_record;\n*/\n'
    return size, structure
|
||||
|
||||
|
||||
# Write records
|
||||
|
||||
def write_records(records, record_size):
    """Write the ucd_records C array to the output file f.

    records maps each field tuple to its record number; the entries are
    written in ascending record-number order, each followed by a comment
    giving its position. record_size is used only for the size comment in
    the array header.
    """
    f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
      '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
    ordered = sorted(records.items(), key = lambda item: item[1])
    for number, (fields, _) in enumerate(ordered):
        line_fmt = ' {' + '%6d, ' * len(fields) + '}, /* %3d */\n'
        f.write(line_fmt % (fields + (number,)))
    f.write('};\n\n')
|
||||
|
||||
|
||||
# Write a bit set
|
||||
|
||||
def write_bitsets(list, item_size):
    """Write each entry of list as a row of 32-bit bitmap words to f.

    Every entry is an iterable of bit numbers. Bit number idx is set in word
    idx // 32 at position idx & 31 of a row of item_size words. Rows are
    written as comma-separated hex constants, and the closing "};" of the
    enclosing C array is written at the end.
    """
    for d in list:
        bitwords = [0] * item_size
        for idx in d:
            word, bit = divmod(idx, 32)
            bitwords[word] |= 1 << bit
        if bitwords:
            f.write(" " + ", ".join("0x%08xu" % x for x in bitwords))
        f.write(",\n")
    f.write("};\n\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# This bit of code must have been useful when the original script was being
|
||||
# developed. Retain it just in case it is ever needed again.
|
||||
|
||||
# def test_record_size():
|
||||
# tests = [ \
|
||||
# ( [(3,), (6,), (6,), (1,)], 1 ), \
|
||||
# ( [(300,), (600,), (600,), (100,)], 2 ), \
|
||||
# ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
||||
# ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
||||
# ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
||||
# ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
||||
# ]
|
||||
# for test in tests:
|
||||
# size, struct = get_record_size_struct(test[0])
|
||||
# assert(size == test[1])
|
||||
# test_record_size()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR CREATING TABLES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Set by the first call to read_table() and checked by every later call.
unicode_version = ""

# Some of the tables imported from GenerateCommon.py carry an alternate
# comment string after each name, for use by GenerateUcpHeader. Only the
# names are wanted here, so keep every second item.

bidi_classes = bidi_classes[::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

# Create the per-code-point tables from the Unicode data files. The third
# argument of each call is the value for code points the file does not list.

script = read_table(
    'Unicode.tables/Scripts.txt',
    make_get_names(script_names),
    script_names.index('Unknown'))
category = read_table(
    'Unicode.tables/DerivedGeneralCategory.txt',
    make_get_names(category_names),
    category_names.index('Cn'))
break_props = read_table(
    'Unicode.tables/GraphemeBreakProperty.txt',
    make_get_names(break_properties),
    break_properties.index('Other'))
other_case = read_table(
    'Unicode.tables/CaseFolding.txt',
    get_other_case,
    0)
bidi_class = read_table(
    'Unicode.tables/DerivedBidiClass.txt',
    make_get_names(bidi_classes),
    bidi_classes.index('L'))
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
# Scan emoji-data.txt and give every Extended_Pictographic code point that
# grapheme break property in break_props. A warning is written to stderr for
# any such code point whose existing break property is not 'Other'; the value
# is overwritten regardless.

gb_other = break_properties.index('Other')
gb_extpict = break_properties.index('Extended_Pictographic')

with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        chardata = list(map(str.strip, line.split(';')))
        if len(chardata) <= 1:
            continue
        if chardata[1] != "Extended_Pictographic":
            continue
        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
        char = int(m.group(1), 16)
        if m.group(3) is None:
            last = char
        else:
            last = int(m.group(3), 16)
        for i in range(char, last + 1):
            if break_props[i] != gb_other:
                # The values must be interpolated with %; passing them as
                # extra print() arguments would output the format string
                # unexpanded.
                print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
                      (i, break_properties[break_props[i]]), file=sys.stderr)
            break_props[i] = gb_extpict
|
||||
|
||||
# Handle script extensions. The get_script_extension() function maintains a
|
||||
# list of unique bitmaps representing lists of scripts, returning the offset
|
||||
# in that list. Initialize the list with an empty set, which is used for
|
||||
# characters that have no script extensions.
|
||||
|
||||
# script_lists starts with an empty entry so that offset 0 means "no script
# extensions"; get_script_extension() appends to it while the file is read.
script_lists = [[]]
last_script_extension = ""
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)

# Merge the bidi class into the same field, placing it above bit 10 of the
# script extension offset. The separate bidi_class table is then dropped.
for idx, bidi in enumerate(bidi_class):
    scriptx_bidi_class[idx] |= bidi << 11
bidi_class = None
|
||||
|
||||
# Find the Boolean properties of each character. This next bit of magic creates
|
||||
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
|
||||
# the *same* list, which is not what we want.
|
||||
|
||||
# One independent list per code point; each list collects the indices (into
# bool_properties) of the Boolean properties that the code point has.
bprops = [[] for _ in range(MAX_UNICODE)]

# Collect the properties from the various files. Properties whose names are
# not in bool_properties are ignored.

for filename in bool_propsfiles:
    try:
        # Read as UTF-8, like the other Unicode data files, so that decoding
        # does not depend on the platform's default encoding.
        file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
    except IOError:
        print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
        sys.exit(1)

    # The with statement closes the file even if a line fails to parse.
    with file:
        for line in file:
            line = re.sub(r'#.*', '', line)
            data = list(map(str.strip, line.split(';')))
            if len(data) <= 1:
                continue

            try:
                ix = bool_properties.index(data[1])
            except ValueError:
                continue

            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)

            for i in range(char, last + 1):
                bprops[i].append(ix)
|
||||
|
||||
# The ASCII property isn't listed in any files, but it is easy enough to add
|
||||
# it manually.
|
||||
|
||||
# Give the ASCII property to the first 128 code points.
ix = bool_properties.index("ASCII")
for ascii_props in bprops[:128]:
    ascii_props.append(ix)
|
||||
|
||||
# The Bidi_Mirrored property isn't listed in any property files. We have to
|
||||
# deduce it from the file that lists the mirrored characters.
|
||||
|
||||
# Every code point that appears in the first field of a BidiMirroring.txt
# data line is given the Bidi_Mirrored property.
ix = bool_properties.index("Bidi_Mirrored")

try:
    # Read as UTF-8, like the other Unicode data files, so that decoding does
    # not depend on the platform's default encoding.
    file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
except IOError:
    print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
    sys.exit(1)

# The with statement closes the file even if a line fails to parse.
with file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        data = list(map(str.strip, line.split(';')))
        if len(data) <= 1:
            continue
        c = int(data[0], 16)
        bprops[c].append(ix)
|
||||
|
||||
# Scan each character's boolean property list and create a list of unique
|
||||
# lists, at the same time, setting the index in that list for each property in
|
||||
# the bool_props vector.
|
||||
|
||||
# bool_props[c] is the offset (list number * bool_props_list_item_size) of
# the property list for code point c. bool_props_lists holds the distinct
# lists in order of first appearance; entry 0 is the empty list. Two lists
# are the same when they contain the same set of property indices.

bool_props = [0] * MAX_UNICODE
bool_props_lists = [[]]

# Map each distinct property set to its position in bool_props_lists, so
# each character needs one dictionary lookup instead of a scan of all the
# lists found so far.
bool_props_index = {frozenset(): 0}

for c in range(MAX_UNICODE):
    key = frozenset(bprops[c])
    i = bool_props_index.get(key)
    if i is None:
        i = bool_props_index[key] = len(bool_props_lists)
        bool_props_lists.append(bprops[c])

    bool_props[c] = i * bool_props_list_item_size
|
||||
|
||||
# This block of code was added by PH in September 2012. It scans the other_case
|
||||
# table to find sets of more than two characters that must all match each other
|
||||
# caselessly. Later in this script a table of these sets is written out.
|
||||
# However, we have to do this work here in order to compute the offsets in the
|
||||
# table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
# Where c has an other-case offset but its partner has none, give the
# partner the opposite offset so that it points back at c.
for c in range(MAX_UNICODE):
    delta = other_case[c]
    if delta == 0:
        continue
    partner = c + delta
    if other_case[partner] == 0:
        other_case[partner] = -delta
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
caseless_sets = []

for c in range(MAX_UNICODE):
    o = c + other_case[c]

    # Nothing to do when this character's other case points back here. When
    # it does not, we have three characters that are case-equivalent.

    if other_case[o] == -other_case[c]:
        continue

    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are
    # already part of a set. If so, unite the existing set with the new set.

    appended = 0
    for s in caseless_sets:
        if not any(x in (c, o, t) for x in s):
            continue

        # Add new characters to an existing set.
        # NOTE(review): found is not reset between candidates, so once one of
        # c, o, t is seen in the set, the later candidates are never appended
        # even if absent. Kept exactly as in the original - confirm intent.

        found = 0
        for y in [c, o, t]:
            if y in s:
                found = 1
            if not found:
                s.append(y)
        appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
        caseless_sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
# Each member of a set gets the same offset. Offsets start at 1 and advance
# by the set's length plus one; characters in no set keep offset 0.
caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in caseless_sets:
    for x in s:
        caseless_offsets[x] = offset
    offset = offset + len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine all the tables
|
||||
|
||||
# Reduce the per-character property tables to a table of record numbers plus
# the set of unique records.

table, records = combine_tables(script, category, break_props,
    caseless_offsets, other_case, scriptx_bidi_class, bool_props)

# Find the record size and create a string definition of the structure for
# outputting as a comment.

record_size, record_struct = get_record_size_struct(list(records))

# Find the optimum block size for the two-stage table: try each power of two
# from 32 to 512 and keep the first one that gives the smallest total size
# (records plus both stage tables).

min_size = sys.maxsize
records_bytes = len(records) * record_size
for block_size in (32, 64, 128, 256, 512):
    stage1, stage2 = compress_table(table, block_size)
    size = records_bytes + get_tables_size(stage1, stage2)
    if size < min_size:
        min_size = size
        min_stage1, min_stage2 = stage1, stage2
        min_block_size = block_size
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR WRITING THE OUTPUT FILE
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucd.c")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
/* This file contains tables of Unicode properties that are extracted from
|
||||
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
|
||||
details.
|
||||
|
||||
As well as being part of the PCRE2 library, this file is #included by the
|
||||
pcre2test program, which redefines the PRIV macro to change table names from
|
||||
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
|
||||
just one of these tables is actually needed. When compiling the library, some
|
||||
headers are needed. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* The tables herein are needed only when UCP support is built, and in PCRE2
|
||||
that happens automatically with UTF support. This module should not be
|
||||
referenced otherwise, so it should not matter whether it is compiled or not.
|
||||
However a comment was received about space saving - maybe the guy linked all
|
||||
the modules rather than using a library - so we include a condition to cut out
|
||||
the tables when not needed. But don't leave a totally empty module because some
|
||||
compilers barf at that. Instead, just supply some small dummy tables. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
|
||||
const uint16_t PRIV(ucd_stage1)[] = {0};
|
||||
const uint16_t PRIV(ucd_stage2)[] = {0};
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
|
||||
#else
|
||||
\n""")
|
||||
|
||||
# --- Output some variable heading stuff ---
|
||||
|
||||
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
|
||||
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
|
||||
|
||||
f.write("""\
|
||||
/* When recompiling tables with a new Unicode version, please check the types
|
||||
in this structure definition with those in pcre2_internal.h (the actual field
|
||||
names will be different).
|
||||
\n""")
|
||||
|
||||
f.write(record_struct)
|
||||
|
||||
f.write("""
|
||||
/* If the 32-bit library is run in non-32-bit mode, character values greater
|
||||
than 0x10ffff may be encountered. For these we set up a special record. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
const ucd_record PRIV(dummy_ucd_record)[] = {{
|
||||
ucp_Unknown, /* script */
|
||||
ucp_Cn, /* type unassigned */
|
||||
ucp_gbOther, /* grapheme break property */
|
||||
0, /* case set */
|
||||
0, /* other case */
|
||||
0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
|
||||
0, /* bool properties offset */
|
||||
}};
|
||||
#endif
|
||||
\n""")
|
||||
|
||||
# --- Output the table of caseless character sets ---
|
||||
|
||||
f.write("""\
|
||||
/* This table contains lists of characters that are caseless sets of
|
||||
more than one character. Each list is terminated by NOTACHAR. */
|
||||
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {
|
||||
NOTACHAR,
|
||||
""")
|
||||
|
||||
for s in caseless_sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
f.write(' 0x%04x,' % x)
|
||||
f.write(' NOTACHAR,\n')
|
||||
f.write('};\n\n')
|
||||
|
||||
# --- Other tables are not needed by pcre2test ---
|
||||
|
||||
f.write("""\
|
||||
/* When #included in pcre2test, we don't need the table of digit sets, nor the
|
||||
the large main UCD tables. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
\n""")
|
||||
|
||||
# --- Read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
# Collect, for every run of decimal digits (lines tagged "Nd" in Scripts.txt),
# the code point of the '9' that ends each group of ten. The resulting list is
# sorted in ascending order before it is written out.

digitsets = []
with open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') as file:
  for line in file:
    m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
    if m is None:
      continue
    first = int(m.group(1),16)
    last  = int(m.group(2),16)
    if ((last - first + 1) % 10) != 0:
      # Report on stderr rather than through f.write(): write() accepts no
      # 'file' keyword (it would raise TypeError), and the diagnostic must not
      # end up inside the generated C source.
      print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
        file=sys.stderr)
    # Step through the range ten code points at a time, recording each '9'.
    while first < last:
      digitsets.append(first + 9)
      first += 10
digitsets.sort()
|
||||
|
||||
f.write("""\
|
||||
/* This table lists the code points for the '9' characters in each set of
|
||||
decimal digits. It is used to ensure that all the digits in a script run come
|
||||
from the same set. */
|
||||
|
||||
const uint32_t PRIV(ucd_digit_sets)[] = {
|
||||
""")
|
||||
|
||||
f.write(" %d, /* Number of subsequent values */" % len(digitsets))
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
f.write("\n ")
|
||||
count = 0
|
||||
f.write(" 0x%05x," % d)
|
||||
count += 1
|
||||
f.write("\n};\n\n")
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of script bitsets for the Script Extension property.
|
||||
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
|
||||
ucd_script_sets_item_size. */
|
||||
|
||||
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||
""")
|
||||
write_bitsets(script_lists, script_list_item_size)
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of bitsets for Boolean properties. The number of
|
||||
32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
|
||||
pcre2_ucp.h. */
|
||||
|
||||
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||
""")
|
||||
write_bitsets(bool_props_lists, bool_props_list_item_size)
|
||||
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
f.write("""\
|
||||
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
|
||||
into a 16-bit field, and offset in binary properties table (16 bits). */
|
||||
\n""")
|
||||
|
||||
write_records(records, record_size)
|
||||
write_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
|
||||
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
|
||||
f.write("""\
|
||||
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
|
||||
#endif
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* End of pcre2_ucd.c */
|
||||
""")
|
||||
|
||||
# Call close() (the bare attribute reference was a no-op) so that the output
# file is flushed and closed explicitly.
f.close()
|
||||
|
||||
# End
|
|
@ -0,0 +1,98 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucp.h file from Unicode data files. This
|
||||
# header uses enumerations to give names to Unicode property types and script
|
||||
# names.
|
||||
|
||||
# This script was created in December 2021 as part of the Unicode data
|
||||
# generation refactoring.
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucp.h")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
|
||||
/* This file contains definitions of the Unicode property values that are
|
||||
returned by the UCD access macros and used throughout PCRE2.
|
||||
|
||||
IMPORTANT: The specific values of the first two enums (general and particular
|
||||
character categories) are assumed by the table called catposstab in the file
|
||||
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
|
||||
an update. */
|
||||
\n""")
|
||||
|
||||
f.write("/* These are the general character categories. */\n\nenum {\n")
|
||||
for i in general_category_names:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the particular character categories. */\n\nenum {\n")
|
||||
for i in range(0, len(category_names), 2):
|
||||
f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are Boolean properties. */\n\nenum {\n")
|
||||
for i in bool_properties:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Bprop_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
|
||||
f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
|
||||
|
||||
f.write("/* These are the bidi class values. */\n\nenum {\n")
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
sp = ' ' * (4 - len(bidi_classes[i]))
|
||||
f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are grapheme break properties. The Extended Pictographic "
|
||||
"property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
|
||||
for i in range(0, len(break_properties), 2):
|
||||
sp = ' ' * (21 - len(break_properties[i]))
|
||||
f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n")
|
||||
for i in script_names:
|
||||
if i == "Unknown":
|
||||
f.write("\n /* Scripts which has no characters in other scripts. */\n")
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("\n")
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Script_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_script_sets[] */\n\n")
|
||||
f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
|
||||
|
||||
f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
|
||||
f.write("/* End of pcre2_ucp.h */\n")
|
||||
|
||||
f.close()
|
||||
|
||||
# End
|
|
@ -0,0 +1,203 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucptables.c file, which contains tables for
|
||||
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
|
||||
# order to reduce the number of relocations when loading the PCRE2 library, the
|
||||
# names are held as a single large string, with offsets in the table. This is
|
||||
# tedious to maintain by hand. Therefore, a script is used to generate the
|
||||
# table.
|
||||
|
||||
# This script was created in December 2021 based on the previous GenerateUtt
|
||||
# script, whose output had to be manually edited into pcre2_tables.c. Here is
|
||||
# the history of the original script:
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
||||
# Added support for bidi class and bidi control, 06-December-2021
|
||||
# This also involved lower casing strings and removing underscores, in
|
||||
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
||||
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
||||
# -----------------------------------------------------------------------------
|
||||
#
|
||||
# Note subsequent changes here:
|
||||
#
|
||||
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||
# 10-January-2022: Further updates for Boolean property support
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
abbreviations, \
|
||||
bool_properties, \
|
||||
bidi_classes, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucptables.c")
|
||||
|
||||
# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
|
||||
# etc., along with comments. We need to add "bidi" in front of each value, in
|
||||
# order to create names that don't clash with other types of property.
|
||||
|
||||
bidi_class_names = []
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
bidi_class_names.append("bidi" + bidi_classes[i])
|
||||
|
||||
# Remove the comments from other lists that contain them.
|
||||
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create standardized versions of the names by lowercasing and removing
|
||||
# underscores.
|
||||
|
||||
def stdname(x):
  """Return x in standardized form: underscores removed and lower-cased."""
  without_underscores = x.replace('_', '')
  return without_underscores.lower()
|
||||
|
||||
def stdnames(x):
  """Return a new list holding the standardized form of each name in x."""
  return [stdname(x[pos]) for pos in range(len(x))]
|
||||
|
||||
std_category_names = stdnames(category_names)
|
||||
std_general_category_names = stdnames(general_category_names)
|
||||
std_bidi_class_names = stdnames(bidi_class_names)
|
||||
std_bool_properties = stdnames(bool_properties)
|
||||
|
||||
# Create the table, starting with the Unicode script, category and bidi class
|
||||
# names. We keep both the standardized name and the original, because the
|
||||
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||
# still use the full original names.
|
||||
|
||||
utt_table = []
|
||||
|
||||
scx_end = script_names.index('Unknown')
|
||||
|
||||
for idx, name in enumerate(script_names):
|
||||
pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
|
||||
utt_table.append((stdname(name), name, pt_type))
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, pt_type))
|
||||
|
||||
# Add the remaining property lists
|
||||
|
||||
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||
|
||||
for name in bool_properties:
|
||||
utt_table.append((stdname(name), name, 'PT_BOOL'))
|
||||
if name in abbreviations:
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
|
||||
|
||||
# Now add specials and synonyms. Note both the standardized and capitalized
|
||||
# forms are needed.
|
||||
|
||||
utt_table.append(('any', 'Any', 'PT_ANY'))
|
||||
utt_table.append(('l&', 'L&', 'PT_LAMP'))
|
||||
utt_table.append(('lc', 'LC', 'PT_LAMP'))
|
||||
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
|
||||
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
|
||||
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
|
||||
utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
|
||||
utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
|
||||
|
||||
# Remove duplicates from the table and then sort it.
|
||||
|
||||
utt_table = list(set(utt_table))
|
||||
utt_table.sort()
|
||||
|
||||
# Output file-specific heading
|
||||
|
||||
f.write("""\
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
||||
/* The PRIV(utt)[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
field of each entry. However, that leads to a large number of relocations when
|
||||
a shared library is dynamically loaded. A significant reduction is made by
|
||||
putting all the names into a single, large string and using offsets instead.
|
||||
All letters are lower cased, and underscores are removed, in accordance with
|
||||
the "loose matching" rules that Unicode advises and Perl uses. */
|
||||
\n""")
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
# Emit one "#define STRING_<name>0 ..." line per table entry. Each character of
# the standardized name becomes a STR_ macro ('&' maps to STR_AMPERSAND), and
# the definition ends with an explicit "\0" terminator string.

for utt in utt_table:
  pieces = ['#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))]
  pieces.extend(' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch for ch in utt[0])
  pieces.append(' "\\0"\n')
  f.write(''.join(pieces))
|
||||
|
||||
# Output the long string of concatenated names
|
||||
|
||||
f.write('\nconst char PRIV(utt_names)[] =\n');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
|
||||
# Output the property type table
|
||||
|
||||
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
|
||||
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
|
||||
value = '0'
|
||||
else:
|
||||
value = 'ucp_' + utt[1]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
f.write('};\n\n')
|
||||
|
||||
# Ending text
|
||||
|
||||
f.write("""\
|
||||
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_ucptables.c */
|
||||
""")
|
||||
|
||||
# Call close() (the bare attribute reference was a no-op) so that the output
# file is flushed and closed explicitly.
f.close()
|
||||
|
||||
# End
|
|
@ -1,137 +0,0 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Generate utt tables. Note: this script has now been converted to Python 3.
|
||||
|
||||
# The source file pcre2_tables.c contains (amongst other things), a table that
|
||||
# is indexed by script name. In order to reduce the number of relocations when
|
||||
# loading the library, the names are held as a single large string, with
|
||||
# offsets in the table. This is tedious to maintain by hand. Therefore, this
|
||||
# script is used to generate the table. The output is sent to stdout; usually
|
||||
# that should be directed to a temporary file. Then pcre2_tables.c can be
|
||||
# edited by replacing the relevant definitions and table therein with the
|
||||
# temporary file.
|
||||
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic', \
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
||||
# New for Unicode 8.0.0
|
||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
||||
'SignWriting',
|
||||
# New for Unicode 10.0.0
|
||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
||||
# New for Unicode 11.0.0
|
||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
||||
'Old_Sogdian', 'Sogdian',
|
||||
# New for Unicode 12.0.0
|
||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
||||
# New for Unicode 13.0.0
|
||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi'
|
||||
]
|
||||
|
||||
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
||||
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
||||
|
||||
general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
|
||||
|
||||
# First add the Unicode script and category names.
|
||||
|
||||
utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
|
||||
utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
|
||||
# Now add our own specials.
|
||||
|
||||
utt_table.append(('Any', 'PT_ANY'))
|
||||
utt_table.append(('L&', 'PT_LAMP'))
|
||||
utt_table.append(('Xan', 'PT_ALNUM'))
|
||||
utt_table.append(('Xps', 'PT_PXSPACE'))
|
||||
utt_table.append(('Xsp', 'PT_SPACE'))
|
||||
utt_table.append(('Xuc', 'PT_UCNC'))
|
||||
utt_table.append(('Xwd', 'PT_WORD'))
|
||||
|
||||
# Sort the table.
|
||||
|
||||
utt_table.sort()
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
for utt in utt_table:
|
||||
print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
|
||||
for c in utt[0]:
|
||||
if c == '_':
|
||||
print('STR_UNDERSCORE', end=' ')
|
||||
elif c == '&':
|
||||
print('STR_AMPERSAND', end=' ')
|
||||
else:
|
||||
print('STR_%s' % c, end=' ');
|
||||
print('"\\0"')
|
||||
|
||||
# Print the actual table, using the string names
|
||||
|
||||
print('')
|
||||
print('const char PRIV(utt_names)[] =');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
# This was how it was done before the EBCDIC-compatible modification.
|
||||
# print ' "%s\\0"%s' % (utt[0], last)
|
||||
|
||||
print('\nconst ucp_type_table PRIV(utt)[] = {')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
|
||||
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
|
||||
value = '0'
|
||||
else:
|
||||
value = 'ucp_' + utt[0]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
print('};')
|
|
@ -1,814 +0,0 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Multistage table builder
|
||||
# (c) Peter Kankowski, 2008
|
||||
|
||||
##############################################################################
|
||||
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
||||
# the upgrading of Unicode property support. The new code speeds up property
|
||||
# matching many times. The script is for the use of PCRE maintainers, to
|
||||
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
|
||||
# data tables. A number of extensions have been added to the original script.
|
||||
#
|
||||
# The script has now been upgraded to Python 3 for PCRE2, and should be run in
|
||||
# the maint subdirectory, using the command
|
||||
#
|
||||
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||
#
|
||||
# It requires six Unicode data tables: DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt,
|
||||
# CaseFolding.txt, and emoji-data.txt. These must be in the
|
||||
# maint/Unicode.tables subdirectory.
|
||||
#
|
||||
# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the
|
||||
# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is
|
||||
# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and
|
||||
# CaseFolding.txt are directly in the UCD directory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to this script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to this script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contains no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), containing a
|
||||
# script number, script extension value, character type, grapheme break type,
|
||||
# offset to caseless matching set, offset to the character's other case, for
|
||||
# every Unicode character. However, a real table covering all Unicode
|
||||
# characters would be far too big. It can be efficiently compressed by
|
||||
# observing that many characters have the same record, and many blocks of
|
||||
# characters (taking 128 characters in a block) have the same set of records as
|
||||
# other blocks. This leads to a 2-stage lookup process.
|
||||
#
|
||||
# This script constructs six tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# The ucd_script_sets vector contains lists of script numbers that are the
|
||||
# Script Extensions properties of certain characters. Each list is terminated
|
||||
# by zero (ucp_Unknown). A character with more than one script listed for its
|
||||
# Script Extension property has a negative value in its record. This is the
|
||||
# negated offset to the start of the relevant list in the ucd_script_sets
|
||||
# vector.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique record that is
|
||||
# required. The ucd_stage1 table is indexed by a character's block number,
|
||||
# which is the character's code point divided by 128, since 128 is the size
|
||||
# of each block. The result of a lookup in ucd_stage1 is a "virtual" block number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 11.0.0 database. Future
|
||||
# updates may change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 17
|
||||
# record 17 is { 34, 5, 12, 0, -32, 34, 0 }
|
||||
# 34 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 34 = ucp_Latin => No special Script Extension property
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 90
|
||||
# lookup 66 (0x42) in table 90 in stage2 yields 564
|
||||
# record 564 is { 27, 7, 12, 0, 0, 27, 0 }
|
||||
# 27 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 27 = ucp_Hiragana => No special Script Extension property
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 458
|
||||
# record 458 is { 28, 12, 3, 0, 0, -101, 0 }
|
||||
# 28 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# -101 => Script Extension list offset = 101
|
||||
# 0 => Dummy value, unused at present
|
||||
#
|
||||
# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29,
|
||||
# and terminator 0. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
|
||||
#
|
||||
# Philip Hazel, 03 July 2008
|
||||
##############################################################################
|
||||
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
|
||||
def make_get_names(enum):
    """Build a line parser for files whose second field is a property name.

    The returned callable takes the list of fields of one data line and
    returns the position of the second field (chardata[1]) within `enum`.
    list.index raises ValueError when the name is not present in `enum`.
    """
    def name_to_index(chardata):
        return enum.index(chardata[1])

    return name_to_index
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
def get_other_case(chardata):
    """Return the other-case offset for one CaseFolding.txt line.

    chardata holds the fields of the line: code point (hex), status, and
    folded code point (hex). For status 'C' or 'S' the result is the folded
    code point minus the original code point; every other status gives 0.
    """
    status = chardata[1]
    if status not in ('C', 'S'):
        return 0
    return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
def get_script_extension(chardata):
    """Return the Script Extension value for one ScriptExtensions.txt line.

    chardata[1] is a space-separated list of script abbreviations. A single
    abbreviation yields its (non-negative) index in script_abbrevs. Several
    abbreviations yield the negated offset of a zero-terminated list of
    script indexes inside the global script_lists, which is extended when no
    matching list is already present.
    """
    abbrevs = list(chardata[1].split(' '))
    if len(abbrevs) == 1:
        return script_abbrevs.index(abbrevs[0])

    # Build the zero-terminated list of script numbers we are looking for.
    wanted = [script_abbrevs.index(abbrev) for abbrev in abbrevs]
    wanted.append(0)
    width = len(wanted)

    # Offset 0 is never used, so the search starts at 1.
    for start in range(1, len(script_lists) - width + 1):
        if script_lists[start:start + width] == wanted:
            return -start

    # Not found in existing lists: append it and return its offset.
    new_offset = len(script_lists)
    script_lists.extend(wanted)
    return -new_offset
|
||||
|
||||
# Read the whole table in memory, setting/checking the Unicode version
|
||||
def read_table(file_name, get_value, default_value):
    """Read a whole Unicode data file into a table indexed by code point.

    file_name      path of the form "<directory>/<base>.txt"
    get_value      callable given the stripped, ';'-separated fields of a
                   data line; returns the value to store for that line
    default_value  initial value of every table entry

    The first line of the file must be "# <base>-<x.y.z>.txt". The version
    found there is stored in the global unicode_version if that is still
    empty; otherwise a warning is written to stderr when it differs.

    Returns a list of MAX_UNICODE values. Raises ValueError if the file name
    or the version line does not have the expected form.
    """
    global unicode_version

    f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
    if f is None:
        raise ValueError("Unexpected Unicode data file name: %s" % file_name)
    file_base = f.group(1)
    version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"

    # The with-statement closes the file even if parsing fails part-way.
    with open(file_name, 'r', encoding='utf-8') as file:
        f = re.match(version_pat, file.readline())
        if f is None:
            raise ValueError("No Unicode version line at the start of %s" % file_name)
        version = f.group(1)
        if unicode_version == "":
            unicode_version = version
        elif unicode_version != version:
            print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

        table = [default_value] * MAX_UNICODE
        for line in file:
            # Drop comments, then split into stripped fields.
            line = re.sub(r'#.*', '', line)
            chardata = list(map(str.strip, line.split(';')))
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)

            # The first field is a single code point or a "first..last" range.
            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)
            for i in range(char, last + 1):
                # It is important not to overwrite a previously set
                # value because in the CaseFolding file there are lines
                # to be ignored (returning the default value of 0)
                # which often come after a line which has already set
                # data.
                if table[i] == default_value:
                    table[i] = value
    return table
|
||||
|
||||
# Get the smallest possible C language type for the values
|
||||
def get_type_size(table):
    """Choose the smallest C integer type able to hold every value in table.

    Returns a (type name, size in bytes) tuple. The unsigned candidates are
    tried first, then the signed ones, each group in order of increasing
    width. Raises OverflowError when no candidate covers the value range.
    """
    candidates = (
        ("uint8_t", 1, 0, 255),
        ("uint16_t", 2, 0, 65535),
        ("uint32_t", 4, 0, 4294967295),
        ("signed char", 1, -128, 127),
        ("pcre_int16", 2, -32768, 32767),
        ("pcre_int32", 4, -2147483648, 2147483647),
    )
    lowest = min(table)
    highest = max(table)
    for c_type, width, lower, upper in candidates:
        if lower <= lowest and highest <= upper:
            return (c_type, width)
    raise OverflowError("Too large to fit into C types")
|
||||
|
||||
def get_tables_size(*tables):
    """Return the total size in bytes of all the given tables.

    Each table contributes its length multiplied by the element size that
    get_type_size selects for it.
    """
    return sum(get_type_size(table)[1] * len(table) for table in tables)
|
||||
|
||||
# Compress the table into the two stages
|
||||
def compress_table(table, block_size):
    """Compress a table into the two stages of a two-stage lookup.

    The table is cut into consecutive blocks of block_size entries.
    Identical blocks are stored only once in stage 2; stage 1 holds, for
    each block of the input, the number of the stage 2 block that contains
    its values.

    Returns (stage1, stage2), both lists. Block numbers are integers.
    """
    blocks = {} # Dictionary for finding identical blocks
    stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = [] # Stage 2 table contains the blocks with property values
    table = tuple(table)
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Floor division keeps the block number
            # an int; "/" would produce a float under Python 3.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)

    return stage1, stage2
|
||||
|
||||
# Print a table
|
||||
def print_table(table, table_name, block_size = None):
    """Print a table to stdout as a C array definition.

    The element type comes from get_type_size. Without a block size the
    values are printed 16 per line, each line followed by a U+XXXX comment.
    With a block size the values are printed one block at a time, each
    block preceded by a comment giving its number.
    """
    c_type, elem_size = get_type_size(table)
    ELEMS_PER_LINE = 16

    header = "const %s %s[] = { /* %d bytes" % (c_type, table_name, elem_size * len(table))
    if block_size:
        header += ", block = %d" % block_size
    print(header + " */")

    values = tuple(table)
    if block_size is None:
        row_fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
        # Number of code points represented by one table entry.
        mult = MAX_UNICODE / len(values)
        for pos in range(0, len(values), ELEMS_PER_LINE):
            row = values[pos:pos + ELEMS_PER_LINE]
            print(row_fmt % (row + (int(pos * mult),)))
    else:
        per_line = min(block_size, ELEMS_PER_LINE)
        block_fmt = "%3d," * per_line + "\n"
        if block_size > ELEMS_PER_LINE:
            # One format line for each full output line of the block.
            block_fmt = block_fmt * int(block_size / ELEMS_PER_LINE)
        for pos in range(0, len(values), block_size):
            block = values[pos:pos + block_size]
            print(("/* block %d */\n" + block_fmt) % ((pos / block_size,) + block))
    print("};\n")
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
def combine_tables(*tables):
    """Extract the unique combinations of property values into records.

    The tables are read in parallel. Each distinct tuple of values becomes
    a record, numbered in order of first appearance.

    Returns (index, records): index lists the record number for every
    position, and records maps each tuple to its record number.
    """
    records = {}
    index = []
    for combo in zip(*tables):
        if combo not in records:
            records[combo] = len(records)
        index.append(records[combo])
    return index, records
|
||||
|
||||
def get_record_size_struct(records):
    """Compute the size of one record and a C struct comment describing it.

    records is a sequence of equal-length tuples. Field i of the struct gets
    the type that get_type_size selects for the i-th value of all records.

    Returns (size, structure): the record size in bytes including padding,
    and the text of a C comment containing the struct definition.
    """
    structure = ('/* When recompiling tables with a new Unicode version, please check the\n'
                 'types in this structure definition from pcre2_internal.h (the actual\n'
                 'field names will be different):\n\ntypedef struct {\n')
    size = 0
    for i in range(len(records[0])):
        column = [record[i] for record in records]
        c_type, width = get_type_size(column)
        # Pad so that the field starts at a multiple of its own width,
        # then account for the field itself.
        size = (size + width - 1) & -width
        size += width
        structure += '%s property_%d;\n' % (c_type, i)

    # Pad the end so that the first field of the next record in an array
    # is aligned as well.
    first_width = get_type_size([record[0] for record in records])[1]
    size = (size + first_width - 1) & -first_width

    structure += '} ucd_record;\n*/\n'
    return size, structure
|
||||
|
||||
def test_record_size():
    """Self-check for get_record_size_struct.

    Each case pairs a list of records with the record size expected for it;
    an AssertionError is raised on the first mismatch.
    """
    cases = (
        ([(3,), (6,), (6,), (1,)], 1),
        ([(300,), (600,), (600,), (100,)], 2),
        ([(25, 3), (6, 6), (34, 6), (68, 1)], 2),
        ([(300, 3), (6, 6), (340, 6), (690, 1)], 4),
        ([(3, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(300, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(3, 100000), (6, 6), (6, 123456), (1, 690)], 8),
        ([(100000, 300), (6, 6), (123456, 6), (1, 690)], 8),
    )
    for records, expected_size in cases:
        size, struct = get_record_size_struct(records)
        assert(size == expected_size)
|
||||
|
||||
def print_records(records, record_size):
    """Print the ucd_records C array to stdout.

    records maps each tuple of field values to its record number; the
    entries are printed in record-number order, one per line, each followed
    by a comment giving its position. record_size is used only for the
    byte counts in the header comment.
    """
    total_bytes = len(records) * record_size
    print('const ucd_record PRIV(ucd_records)[] = { '
          '/* %d bytes, record size %d */' % (total_bytes, record_size))

    ordered = sorted(records.items(), key=lambda item: item[1])
    for position, (fields, _) in enumerate(ordered):
        row_fmt = ' {' + '%6d, ' * len(fields) + '}, /* %3d */'
        print(row_fmt % (fields + (position,)))
    print('};\n')
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic',
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
||||
# New for Unicode 8.0.0
|
||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
||||
'SignWriting',
|
||||
# New for Unicode 10.0.0
|
||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
||||
# New for Unicode 11.0.0
|
||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
||||
'Old_Sogdian', 'Sogdian',
|
||||
# New for Unicode 12.0.0
|
||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
||||
# New for Unicode 13.0.0
|
||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi'
|
||||
]
|
||||
|
||||
script_abbrevs = [
|
||||
'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
|
||||
'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
|
||||
'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
|
||||
'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
|
||||
'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
|
||||
'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
|
||||
'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
|
||||
#New for Unicode 5.0
|
||||
'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
|
||||
#New for Unicode 5.1
|
||||
'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
|
||||
'Sund', 'Vaii',
|
||||
#New for Unicode 5.2
|
||||
'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
|
||||
'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
|
||||
#New for Unicode 6.0.0
|
||||
'Batk', 'Brah', 'Mand',
|
||||
#New for Unicode 6.1.0
|
||||
'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
|
||||
#New for Unicode 7.0.0
|
||||
'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
|
||||
'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
|
||||
'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
|
||||
#New for Unicode 8.0.0
|
||||
'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
|
||||
#New for Unicode 10.0.0
|
||||
'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
|
||||
'Zanb',
|
||||
#New for Unicode 11.0.0
|
||||
'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
|
||||
#New for Unicode 12.0.0
|
||||
'Elym', 'Nand', 'Hmnp', 'Wcho',
|
||||
#New for Unicode 13.0.0
|
||||
'Chrs', 'Diak', 'Kits', 'Yezi'
|
||||
]
|
||||
|
||||
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
||||
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
|
||||
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
|
||||
'ZWJ', 'Extended_Pictographic' ]
|
||||
|
||||
test_record_size()
|
||||
unicode_version = ""
|
||||
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
|
||||
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
||||
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
|
||||
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
# Scan emoji-data.txt and give every Extended_Pictographic code point that
# grapheme break property in break_props. The with-statement closes the file
# even if a line cannot be parsed.
with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Drop comments, then split into stripped fields.
        line = re.sub(r'#.*', '', line)
        chardata = list(map(str.strip, line.split(';')))
        if len(chardata) <= 1:
            continue

        if chardata[1] != "Extended_Pictographic":
            continue

        # The first field is a single code point or a "first..last" range.
        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
        char = int(m.group(1), 16)
        if m.group(3) is None:
            last = char
        else:
            last = int(m.group(3), 16)
        for i in range(char, last + 1):
            if break_props[i] != break_property_names.index('Other'):
                # Format the message before printing; passing the values as
                # extra print() arguments would leave the %x and %s unexpanded.
                print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
                      (i, break_property_names[break_props[i]]), file=sys.stderr)
            break_props[i] = break_property_names.index('Extended_Pictographic')
|
||||
|
||||
# The Script Extensions property default value is the Script value. Parse the
|
||||
# file, setting 'Unknown' as the default (this will never be a Script Extension
|
||||
# value), then scan it and fill in the default from Scripts. Code added by PH
|
||||
# in October 2018. Positive values are used for just a single script for a
|
||||
# code point. Negative values are negated offsets in a list of lists of
|
||||
# multiple scripts. Initialize this list with a single entry, as the zeroth
|
||||
# element is never used.
|
||||
|
||||
# Offset 0 of script_lists is a placeholder, so real lists start at offset 1.
script_lists = [0]
script_abbrevs_default = script_abbrevs.index('Zzzz')
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)

# Code points that still hold the default take their Script value instead.
for i, extension in enumerate(scriptx):
    if extension == script_abbrevs_default:
        scriptx[i] = script[i]
|
||||
|
||||
# With the addition of the new Script Extensions field, we need some padding
|
||||
# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
|
||||
# greater than 255 to make the field 16 bits.
|
||||
|
||||
padding_dummy = [0] * MAX_UNICODE
|
||||
padding_dummy[0] = 256
|
||||
|
||||
# This block of code was added by PH in September 2012. I am not a Python
|
||||
# programmer, so the style is probably dreadful, but it does the job. It scans
|
||||
# the other_case table to find sets of more than two characters that must all
|
||||
# match each other caselessly. Later in this script a table of these sets is
|
||||
# written out. However, we have to do this work here in order to compute the
|
||||
# offsets in the table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
sets = []
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
# now have three characters that are case-equivalent.
|
||||
|
||||
if other_case[o] != -other_case[c]:
|
||||
t = o + other_case[o]
|
||||
|
||||
# Scan the existing sets to see if any of the three characters are already
|
||||
# part of a set. If so, unite the existing set with the new set.
|
||||
|
||||
appended = 0
|
||||
for s in sets:
|
||||
found = 0
|
||||
for x in s:
|
||||
if x == c or x == o or x == t:
|
||||
found = 1
|
||||
|
||||
# Add new characters to an existing set
|
||||
|
||||
if found:
|
||||
found = 0
|
||||
for y in [c, o, t]:
|
||||
for x in s:
|
||||
if x == y:
|
||||
found = 1
|
||||
if not found:
|
||||
s.append(y)
|
||||
appended = 1
|
||||
|
||||
# If we have not added to an existing set, create a new one.
|
||||
|
||||
if not appended:
|
||||
sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
# Every member of a caseless set gets the offset at which that set starts in
# the ucd_caseless_sets table. Offsets start at 1 because the table begins
# with a NOTACHAR entry, and each set takes one extra slot for the NOTACHAR
# that terminates it.
caseless_offsets = [0] * MAX_UNICODE

offset = 1
for members in sets:
    for code_point in members:
        caseless_offsets[code_point] = offset
    offset += len(members) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case, scriptx, padding_dummy)
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
size += get_tables_size(stage1, stage2)
|
||||
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
print("/* This module is generated by the maint/MultiStage2.py script.")
|
||||
print("Do not modify it by hand. Instead modify the script and run it")
|
||||
print("to regenerate this code.")
|
||||
print()
|
||||
print("As well as being part of the PCRE2 library, this module is #included")
|
||||
print("by the pcre2test program, which redefines the PRIV macro to change")
|
||||
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
|
||||
print("with the library. At present, just one of these tables is actually")
|
||||
print("needed. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_PCRE2TEST")
|
||||
print()
|
||||
print("#ifdef HAVE_CONFIG_H")
|
||||
print("#include \"config.h\"")
|
||||
print("#endif")
|
||||
print()
|
||||
print("#include \"pcre2_internal.h\"")
|
||||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
print()
|
||||
print("/* Unicode character database. */")
|
||||
print("/* This file was autogenerated by the MultiStage2.py script. */")
|
||||
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
|
||||
print()
|
||||
print("/* The tables herein are needed only when UCP support is built,")
|
||||
print("and in PCRE2 that happens automatically with UTF support.")
|
||||
print("This module should not be referenced otherwise, so")
|
||||
print("it should not matter whether it is compiled or not. However")
|
||||
print("a comment was received about space saving - maybe the guy linked")
|
||||
print("all the modules rather than using a library - so we include a")
|
||||
print("condition to cut out the tables when not needed. But don't leave")
|
||||
print("a totally empty module because some compilers barf at that.")
|
||||
print("Instead, just supply some small dummy tables. */")
|
||||
print()
|
||||
print("#ifndef SUPPORT_UNICODE")
|
||||
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
|
||||
print("const uint16_t PRIV(ucd_stage1)[] = {0};")
|
||||
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
|
||||
print("#else")
|
||||
print()
|
||||
print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
|
||||
print()
|
||||
print("/* If the 32-bit library is run in non-32-bit mode, character values")
|
||||
print("greater than 0x10ffff may be encountered. For these we set up a")
|
||||
print("special record. */")
|
||||
print()
|
||||
print("#if PCRE2_CODE_UNIT_WIDTH == 32")
|
||||
print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
|
||||
print(" ucp_Unknown, /* script */")
|
||||
print(" ucp_Cn, /* type unassigned */")
|
||||
print(" ucp_gbOther, /* grapheme break property */")
|
||||
print(" 0, /* case set */")
|
||||
print(" 0, /* other case */")
|
||||
print(" ucp_Unknown, /* script extension */")
|
||||
print(" 0, /* dummy filler */")
|
||||
print(" }};")
|
||||
print("#endif")
|
||||
print()
|
||||
print(record_struct)
|
||||
|
||||
# --- Added by PH: output the table of caseless character sets ---
|
||||
|
||||
print("/* This table contains lists of characters that are caseless sets of")
|
||||
print("more than one character. Each list is terminated by NOTACHAR. */\n")
|
||||
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
|
||||
print(" NOTACHAR,")
|
||||
for s in sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
print(' 0x%04x,' % x, end=' ')
|
||||
print(' NOTACHAR,')
|
||||
print('};')
|
||||
print()
|
||||
|
||||
# ------
|
||||
|
||||
# Open the #ifndef PCRE2_PCRE2TEST section of the generated file; the
# matching #endif is printed at the very end of the output.
print("/* When #included in pcre2test, we don't need the table of digit")
print("sets, nor the large main UCD tables. */")
print()
print("#ifndef PCRE2_PCRE2TEST")
print()
|
||||
|
||||
# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
digitsets = []
|
||||
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
|
||||
|
||||
for line in file:
|
||||
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
|
||||
if m is None:
|
||||
continue
|
||||
first = int(m.group(1),16)
|
||||
last = int(m.group(2),16)
|
||||
if ((last - first + 1) % 10) != 0:
|
||||
print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
|
||||
file=sys.stderr)
|
||||
while first < last:
|
||||
digitsets.append(first + 9)
|
||||
first += 10
|
||||
file.close()
|
||||
digitsets.sort()
|
||||
|
||||
print("/* This table lists the code points for the '9' characters in each")
|
||||
print("set of decimal digits. It is used to ensure that all the digits in")
|
||||
print("a script run come from the same set. */\n")
|
||||
print("const uint32_t PRIV(ucd_digit_sets)[] = {")
|
||||
|
||||
print(" %d, /* Number of subsequent values */" % len(digitsets), end='')
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
print("\n ", end='')
|
||||
count = 0
|
||||
print(" 0x%05x," % d, end='')
|
||||
count += 1
|
||||
print("\n};\n")
|
||||
|
||||
print("/* This vector is a list of lists of scripts for the Script Extension")
|
||||
print("property. Each sublist is zero-terminated. */\n")
|
||||
print("const uint8_t PRIV(ucd_script_sets)[] = {")
|
||||
|
||||
count = 0
|
||||
print(" /* 0 */", end='')
|
||||
for d in script_lists:
|
||||
print(" %3d," % d, end='')
|
||||
count += 1
|
||||
if d == 0:
|
||||
print("\n /* %3d */" % count, end='')
|
||||
print("\n};\n")
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
print("/* These are the main two-stage UCD tables. The fields in each record are:")
|
||||
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
|
||||
print("offset to multichar other cases or zero (8 bits), offset to other case")
|
||||
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
|
||||
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
|
||||
|
||||
print_records(records, record_size)
|
||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
|
||||
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
|
||||
print("#endif")
|
||||
print("#endif /* SUPPORT_UNICODE */")
|
||||
print()
|
||||
print("#endif /* PCRE2_PCRE2TEST */")
|
||||
|
||||
|
||||
# This code was part of the original contribution, but is commented out as it
|
||||
# was never used. A two-stage table has sufficed.
|
||||
|
||||
"""
|
||||
|
||||
# Three-stage tables:
|
||||
|
||||
# Find the optimum block size for 3-stage table
|
||||
min_size = sys.maxint
|
||||
for stage3_block in [2 ** i for i in range(2,6)]:
|
||||
stage_i, stage3 = compress_table(table, stage3_block)
|
||||
for stage2_block in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * 4
|
||||
stage1, stage2 = compress_table(stage_i, stage2_block)
|
||||
size += get_tables_size(stage1, stage2, stage3)
|
||||
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
||||
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
||||
|
||||
print "/* Total size: %d bytes" % min_size */
|
||||
print_records(records)
|
||||
print_table(min_stage1, 'ucd_stage1')
|
||||
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
||||
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
||||
|
||||
"""
|
202
maint/README
202
maint/README
|
@ -16,99 +16,122 @@ and also contains some notes for maintainers. Its contents are:
|
|||
Files in the maint directory
|
||||
============================
|
||||
|
||||
GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
|
||||
that contains Unicode script names in a long string with
|
||||
offsets, which is tedious to maintain by hand.
|
||||
GenerateCommon.py
|
||||
A Python module containing data and functions that are used by the other
|
||||
Generate scripts.
|
||||
|
||||
GenerateTest26.py
|
||||
A Python script that generates input and expected output test data for test
|
||||
26, which tests certain aspects of Unicode property support.
|
||||
|
||||
ManyConfigTests A shell script that runs "configure, make, test" a number of
|
||||
times with different configuration settings.
|
||||
GenerateUcd.py
|
||||
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
|
||||
and Unicode data files, which are themselves downloaded from the Unicode web
|
||||
site. The generated file contains the tables for a 2-stage lookup of Unicode
|
||||
properties, along with some auxiliary tables. The script starts with a long
|
||||
comment that gives details of the tables it constructs.
|
||||
|
||||
MultiStage2.py A Python script that generates the file pcre2_ucd.c from six
|
||||
Unicode data files, which are themselves downloaded from the
|
||||
Unicode web site. Run this script in the "maint" directory.
|
||||
The generated file is written to stdout. It contains the
|
||||
tables for a 2-stage lookup of Unicode properties, along with
|
||||
some auxiliary tables.
|
||||
GenerateUcpHeader.py
|
||||
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
|
||||
and Unicode data files. The generated file defines constants for various
|
||||
Unicode property values.
|
||||
|
||||
GenerateUcpTables.py
|
||||
A Python script that generates the file pcre2_ucptables.c from
|
||||
GenerateCommon.py and Unicode data files. The generated file contains tables
|
||||
for looking up Unicode property names.
|
||||
|
||||
ManyConfigTests
|
||||
A shell script that runs "configure, make, test" a number of times with
|
||||
different configuration settings.
|
||||
|
||||
pcre2_chartables.c.non-standard
|
||||
This is a set of character tables that came from a Windows
|
||||
system. It has characters greater than 128 that are set as
|
||||
spaces, amongst other things. I kept it so that it can be
|
||||
used for testing from time to time.
|
||||
This is a set of character tables that came from a Windows system. It has
|
||||
characters greater than 128 that are set as spaces, amongst other things. I
|
||||
kept it so that it can be used for testing from time to time.
|
||||
|
||||
README This file.
|
||||
README
|
||||
This file.
|
||||
|
||||
Unicode.tables The files in this directory were downloaded from the Unicode
|
||||
web site. They contain information about Unicode characters
|
||||
and scripts. The ones used by the MultiStage2.py script are
|
||||
CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt,
|
||||
ScriptExtensions.txt, GraphemeBreakProperty.txt, and
|
||||
emoji-data.txt. I've kept UnicodeData.txt (which is no longer
|
||||
used by the script) because it is useful occasionally for
|
||||
manually looking up the details of certain characters.
|
||||
However, note that character names in this file such as
|
||||
"Arabic sign sanah" do NOT mean that the character is in a
|
||||
particular script (in this case, Arabic). Scripts.txt and
|
||||
ScriptExtensions.txt are where to look for script information.
|
||||
Unicode.tables
|
||||
The files in this directory were downloaded from the Unicode web site. They
|
||||
contain information about Unicode characters and scripts, and are used by the
|
||||
Generate scripts. UnicodeData.txt, which is no longer used by any script, is
|
||||
also kept because it is useful occasionally for manually looking up the
|
||||
details of certain characters. However, note that character names in this
|
||||
file such as "Arabic sign sanah" do NOT mean that the character is in a
|
||||
particular script (in this case, Arabic). Scripts.txt and
|
||||
ScriptExtensions.txt are where to look for script information.
|
||||
|
||||
ucptest.c A short C program for testing the Unicode property macros
|
||||
that do lookups in the pcre2_ucd.c data, mainly useful after
|
||||
rebuilding the Unicode property table. Compile and run this in
|
||||
the "maint" directory (see comments at its head). This program
|
||||
can also be used to find characters with specific properties.
|
||||
ucptest.c
|
||||
A program for testing the Unicode property macros that do lookups in the
|
||||
pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
|
||||
Compile and run this in the "maint" directory (see comments at its head).
|
||||
This program can also be used to find characters with specific properties and
|
||||
to list which properties are supported.
|
||||
|
||||
ucptestdata A directory containing four files, testinput{1,2} and
|
||||
testoutput{1,2}, for use in conjunction with the ucptest
|
||||
program.
|
||||
ucptestdata
|
||||
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
|
||||
use in conjunction with the ucptest program.
|
||||
|
||||
utf8.c A short, freestanding C program for converting a Unicode code
|
||||
point into a sequence of bytes in the UTF-8 encoding, and vice
|
||||
versa. If its argument is a hex number such as 0x1234, it
|
||||
outputs a list of the equivalent UTF-8 bytes. If its argument
|
||||
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
|
||||
treats them as a UTF-8 character and outputs the equivalent
|
||||
code point in hex. See comments at its head for details.
|
||||
utf8.c
|
||||
A short, freestanding C program for converting a Unicode code point into a
|
||||
sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
|
||||
hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
|
||||
If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
|
||||
treats them as a UTF-8 string and outputs the equivalent code points in hex.
|
||||
See comments at its head for details.
|
||||
|
||||
|
||||
Updating to a new Unicode release
|
||||
=================================
|
||||
|
||||
When there is a new release of Unicode, the files in Unicode.tables must be
|
||||
refreshed from the web site. If the new version of Unicode adds new character
|
||||
scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
|
||||
GenerateUtt.py scripts must be edited to add the new names. I have been adding
|
||||
each new group at the end of the relevant list, with a comment. Note also that
|
||||
both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode
|
||||
script names.
|
||||
refreshed from the web site. Once that is done, the four Python scripts that
|
||||
generate files from the Unicode data can be run from within the "maint"
|
||||
directory.
|
||||
|
||||
MultiStage2.py has two lists: the full names and the abbreviations that are
|
||||
found in the ScriptExtensions.txt file. A list of script names and their
|
||||
abbreviations can be found in the PropertyValueAliases.txt file on the
|
||||
Unicode web site. There is also a Wikipedia page that lists them, and notes the
|
||||
Unicode version in which they were introduced:
|
||||
Note: Previously, it was necessary to update lists of scripts and their
|
||||
abbreviations by hand before running the Python scripts. This is no longer
|
||||
necessary because the scripts have been upgraded to extract this information
|
||||
themselves. Also, there used to be explicit lists of scripts in two of the man
|
||||
pages. This is no longer the case; the pcre2test program can now output a list
|
||||
of supported scripts.
|
||||
|
||||
https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
|
||||
You can give an output file name as an argument to the following scripts, but
|
||||
by default:
|
||||
|
||||
Once the script name lists have been updated, MultiStage2.py can be run to
|
||||
generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to
|
||||
generate the tricky tables for inclusion in pcre2_tables.c (which must be
|
||||
hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x
|
||||
not in list", the cause is usually a missing (or misspelt) name in one of the
|
||||
lists of scripts.
|
||||
GenerateUcd.py creates pcre2_ucd.c )
|
||||
GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory
|
||||
GenerateUcpTables.py creates pcre2_ucptables.c )
|
||||
|
||||
The ucptest program can be compiled and used to check that the new tables in
|
||||
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
|
||||
number of test characters. It used to be necessary to update the source
|
||||
ucptest.c whenever new Unicode scripts were added, but this is no longer
|
||||
required because that program now uses the lists in the PCRE2 source. However,
|
||||
adding a few tests for new scripts to the files in ucptestdata is a good idea.
|
||||
These files can be compared against the existing versions in the src directory
|
||||
to check on any changes before replacing the old files, but you can also
|
||||
generate directly into the final location by running:
|
||||
|
||||
./GenerateUcd.py ../src/pcre2_ucd.c
|
||||
./GenerateUcpHeader.py ../src/pcre2_ucp.h
|
||||
./GenerateUcpTables.py ../src/pcre2_ucptables.c
|
||||
|
||||
Once the .c and .h files are in the ../src directory, the ucptest program can
|
||||
be compiled and used to check that the new tables work properly. The data files
|
||||
in ucptestdata are set up to check a number of test characters. See the
|
||||
comments at the start of ucptest.c. If there are new scripts, adding a few
|
||||
tests to the files in ucptestdata is a good idea.
|
||||
|
||||
Finally, you should run the GenerateTest26.py script to regenerate new versions
|
||||
of the input and expected output from a series of Unicode property tests that
|
||||
are automatically generated from the Unicode data files. By default, the files
|
||||
are written to testinput26 and testoutput26 in the current directory, but you
|
||||
can give an alternative directory name as an argument to the script. These
|
||||
files should eventually be installed in the main testdata directory.
|
||||
|
||||
|
||||
Preparing for a PCRE2 release
|
||||
=============================
|
||||
|
||||
This section contains a checklist of things that I consult before building a
|
||||
distribution for a new release.
|
||||
This section contains a checklist of things that I do before building a new
|
||||
release.
|
||||
|
||||
. Ensure that the version number and version date are correct in configure.ac.
|
||||
|
||||
|
@ -117,17 +140,16 @@ distribution for a new release.
|
|||
|
||||
. If new build options or new source files have been added, ensure that they
|
||||
are added to the CMake files as well as to the autoconf files. The relevant
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release
|
||||
tarball, test it out with CMake if there have been changes here.
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release, test
|
||||
it out with CMake if there have been changes here.
|
||||
|
||||
. Run ./autogen.sh to ensure everything is up-to-date.
|
||||
|
||||
. Compile and test with many different config options, and combinations of
|
||||
options. Also, test with valgrind by running "RunTest valgrind" and
|
||||
"RunGrepTest valgrind" (which takes quite a long time). The script
|
||||
maint/ManyConfigTests now encapsulates this testing. It runs tests with
|
||||
different configurations, and it also runs some of them with valgrind, all of
|
||||
which can take quite some time.
|
||||
"RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
|
||||
this testing. It runs tests with different configurations, and it also runs
|
||||
some of them with valgrind, all of which can take quite some time.
|
||||
|
||||
. Run tests in both 32-bit and 64-bit environments if possible. I can no longer
|
||||
run 32-bit tests.
|
||||
|
@ -142,7 +164,8 @@ distribution for a new release.
|
|||
-fsanitize=signed-integer-overflow
|
||||
|
||||
. Do a test build using CMake. Remove src/config.h first, lest it override the
|
||||
version that CMake creates. Do NOT use parallel make.
|
||||
version that CMake creates. Also do a CMake unity build to check that it
|
||||
still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.
|
||||
|
||||
. Run perltest.sh on the test data for tests 1 and 4. The output should match
|
||||
the PCRE2 test output, apart from the version identification at the start of
|
||||
|
@ -161,11 +184,12 @@ distribution for a new release.
|
|||
systems. For example, on Solaris it is helpful to test using Sun's cc
|
||||
compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
|
||||
64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
|
||||
for test 2. Since I retired I can no longer do much of this, but instead I
|
||||
rely on putting out release candidates for testing by the community.
|
||||
for test 2. Since I retired I can no longer do much of this. There are
|
||||
automated tests under Ubuntu, Alpine, and Windows that are now set up as
|
||||
GitHub actions. Check that they are running clean.
|
||||
|
||||
. The buildbots at http://buildfarm.opencsw.org/ do some automated testing
|
||||
of PCRE2 and should be checked before putting out a release.
|
||||
of PCRE2 and should also be checked before putting out a release.
|
||||
|
||||
|
||||
Updating version info for libtool
|
||||
|
@ -221,10 +245,11 @@ it reports them and then aborts. Otherwise it removes trailing spaces from
|
|||
sources and refreshes the HTML documentation. Update the GitHub repository with
|
||||
"git push".
|
||||
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarball
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
|
||||
and the zipball. I then sign these files. Double-check with "git status" that
|
||||
the repository is fully up-to-date, then create a new tag on GitHub. Upload the
|
||||
tarball, zipball, and the signatures as "assets" of the GitHub release.
|
||||
the repository is fully up-to-date, then create a new tag and a release on
|
||||
GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
|
||||
GitHub release.
|
||||
|
||||
When the new release is out, don't forget to tell webmaster@pcre.org and the
|
||||
mailing list.
|
||||
|
@ -343,8 +368,6 @@ years.
|
|||
|
||||
See Unicode TR 29. The last two are very much aimed at natural language.
|
||||
|
||||
. (?[...]) extended classes: big project.
|
||||
|
||||
. Allow a callout to specify a number of characters to skip. This can be done
|
||||
compatibly via an extra callout field.
|
||||
|
||||
|
@ -414,13 +437,8 @@ years.
|
|||
with lookarounds for \b and \B. Ideally the setting should last till the end
|
||||
of the group, which means remembering all previous settings; maybe a fixed
|
||||
amount of stack would do - how deep would anyone want to nest these things?
|
||||
See GitHub issue #13 for a compendium of character class issues.
|
||||
|
||||
. Recognize the short script names. They are already listed in maint/
|
||||
Multistage2.py because they are needed for scanning the script extensions
|
||||
file.
|
||||
|
||||
. Use script extensions for \p?
|
||||
See GitHub issue #13 for a compendium of character class issues, including
|
||||
(?[...]) extended classes.
|
||||
|
||||
. A user suggested something like --with-build-info to set a build information
|
||||
string that could be retrieved by pcre2_config(). However, there's no
|
||||
|
@ -439,4 +457,4 @@ years.
|
|||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 26 August 2021
|
||||
Last updated: 25 April 2022
|
||||
|
|
|
@ -0,0 +1,633 @@
|
|||
# BidiMirroring-14.0.0.txt
|
||||
# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
|
||||
# © 2021 Unicode®, Inc.
|
||||
# For terms of use, see https://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see https://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Bidi_Mirroring_Glyph Property
|
||||
#
|
||||
# This file is an informative contributory data file in the
|
||||
# Unicode Character Database.
|
||||
#
|
||||
# This data file lists characters that have the Bidi_Mirrored=Yes property
|
||||
# value, for which there is another Unicode character that typically has a glyph
|
||||
# that is the mirror image of the original character's glyph.
|
||||
#
|
||||
# The repertoire covered by the file is Unicode 14.0.0.
|
||||
#
|
||||
# The file contains a list of lines with mappings from one code point
|
||||
# to another one for character-based mirroring.
|
||||
# Note that for "real" mirroring, a rendering engine needs to select
|
||||
# appropriate alternative glyphs, and that many Unicode characters do not
|
||||
# have a mirror-image Unicode character.
|
||||
#
|
||||
# Each mapping line contains two fields, separated by a semicolon (';').
|
||||
# Each of the two fields contains a code point represented as a
|
||||
# variable-length hexadecimal value with 4 to 6 digits.
|
||||
# A comment indicates where the characters are "BEST FIT" mirroring.
|
||||
#
|
||||
# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
|
||||
# characters exist with mirrored glyphs, are
|
||||
# listed as comments at the end of the file.
|
||||
#
|
||||
# Formally, the default value of the Bidi_Mirroring_Glyph property
|
||||
# for each code point is <none>, unless a mapping to
|
||||
# some other character is specified in this data file. When a code
|
||||
# point has the default value for the Bidi_Mirroring_Glyph property,
|
||||
# that means that no other character exists whose glyph is suitable
|
||||
# for character-based mirroring.
|
||||
#
|
||||
# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
|
||||
# at https://www.unicode.org/reports/tr9/
|
||||
#
|
||||
# This file was originally created by Markus Scherer.
|
||||
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
|
||||
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
|
||||
#
|
||||
# Historical and Compatibility Information:
|
||||
#
|
||||
# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
|
||||
# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
|
||||
# See https://www.microsoft.com/typography/otspec/ompl.txt
|
||||
#
|
||||
# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
|
||||
# added one mirroring pair: 27CB <--> 27CD.
|
||||
#
|
||||
# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
|
||||
# underwent a substantial revision, to formally recognize all of the
|
||||
# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
|
||||
# added after the freezing of the OMPL list. As a result, starting
|
||||
# with Unicode 11.0, the bmg mapping values more accurately reflect
|
||||
# the current status of glyphs for Bidi_Mirrored characters in
|
||||
# the Unicode Standard, but this listing now extends significantly
|
||||
# beyond the frozen OMPL list. Implementers should be aware of this
|
||||
# intentional distinction.
|
||||
#
|
||||
# ############################################################
|
||||
#
|
||||
# Property: Bidi_Mirroring_Glyph
|
||||
#
|
||||
# @missing: 0000..10FFFF; <none>
|
||||
|
||||
0028; 0029 # LEFT PARENTHESIS
|
||||
0029; 0028 # RIGHT PARENTHESIS
|
||||
003C; 003E # LESS-THAN SIGN
|
||||
003E; 003C # GREATER-THAN SIGN
|
||||
005B; 005D # LEFT SQUARE BRACKET
|
||||
005D; 005B # RIGHT SQUARE BRACKET
|
||||
007B; 007D # LEFT CURLY BRACKET
|
||||
007D; 007B # RIGHT CURLY BRACKET
|
||||
00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
|
||||
0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
|
||||
0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
|
||||
0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
|
||||
169B; 169C # OGHAM FEATHER MARK
|
||||
169C; 169B # OGHAM REVERSED FEATHER MARK
|
||||
2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
|
||||
2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
|
||||
207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
|
||||
207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
|
||||
208D; 208E # SUBSCRIPT LEFT PARENTHESIS
|
||||
208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
|
||||
2208; 220B # ELEMENT OF
|
||||
2209; 220C # [BEST FIT] NOT AN ELEMENT OF
|
||||
220A; 220D # SMALL ELEMENT OF
|
||||
220B; 2208 # CONTAINS AS MEMBER
|
||||
220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
|
||||
220D; 220A # SMALL CONTAINS AS MEMBER
|
||||
2215; 29F5 # DIVISION SLASH
|
||||
221F; 2BFE # RIGHT ANGLE
|
||||
2220; 29A3 # ANGLE
|
||||
2221; 299B # MEASURED ANGLE
|
||||
2222; 29A0 # SPHERICAL ANGLE
|
||||
2224; 2AEE # DOES NOT DIVIDE
|
||||
223C; 223D # TILDE OPERATOR
|
||||
223D; 223C # REVERSED TILDE
|
||||
2243; 22CD # ASYMPTOTICALLY EQUAL TO
|
||||
2245; 224C # APPROXIMATELY EQUAL TO
|
||||
224C; 2245 # ALL EQUAL TO
|
||||
2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
|
||||
2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
|
||||
2254; 2255 # COLON EQUALS
|
||||
2255; 2254 # EQUALS COLON
|
||||
2264; 2265 # LESS-THAN OR EQUAL TO
|
||||
2265; 2264 # GREATER-THAN OR EQUAL TO
|
||||
2266; 2267 # LESS-THAN OVER EQUAL TO
|
||||
2267; 2266 # GREATER-THAN OVER EQUAL TO
|
||||
2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
|
||||
2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
|
||||
226A; 226B # MUCH LESS-THAN
|
||||
226B; 226A # MUCH GREATER-THAN
|
||||
226E; 226F # [BEST FIT] NOT LESS-THAN
|
||||
226F; 226E # [BEST FIT] NOT GREATER-THAN
|
||||
2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
|
||||
2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
|
||||
2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
|
||||
2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
|
||||
2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
|
||||
2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
|
||||
2276; 2277 # LESS-THAN OR GREATER-THAN
|
||||
2277; 2276 # GREATER-THAN OR LESS-THAN
|
||||
2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
|
||||
2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
|
||||
227A; 227B # PRECEDES
|
||||
227B; 227A # SUCCEEDS
|
||||
227C; 227D # PRECEDES OR EQUAL TO
|
||||
227D; 227C # SUCCEEDS OR EQUAL TO
|
||||
227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
|
||||
227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
|
||||
2280; 2281 # [BEST FIT] DOES NOT PRECEDE
|
||||
2281; 2280 # [BEST FIT] DOES NOT SUCCEED
|
||||
2282; 2283 # SUBSET OF
|
||||
2283; 2282 # SUPERSET OF
|
||||
2284; 2285 # [BEST FIT] NOT A SUBSET OF
|
||||
2285; 2284 # [BEST FIT] NOT A SUPERSET OF
|
||||
2286; 2287 # SUBSET OF OR EQUAL TO
|
||||
2287; 2286 # SUPERSET OF OR EQUAL TO
|
||||
2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
|
||||
2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
|
||||
228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
|
||||
228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
|
||||
228F; 2290 # SQUARE IMAGE OF
|
||||
2290; 228F # SQUARE ORIGINAL OF
|
||||
2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
|
||||
2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
|
||||
2298; 29B8 # CIRCLED DIVISION SLASH
|
||||
22A2; 22A3 # RIGHT TACK
|
||||
22A3; 22A2 # LEFT TACK
|
||||
22A6; 2ADE # ASSERTION
|
||||
22A8; 2AE4 # TRUE
|
||||
22A9; 2AE3 # FORCES
|
||||
22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
22B0; 22B1 # PRECEDES UNDER RELATION
|
||||
22B1; 22B0 # SUCCEEDS UNDER RELATION
|
||||
22B2; 22B3 # NORMAL SUBGROUP OF
|
||||
22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
|
||||
22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
|
||||
22B6; 22B7 # ORIGINAL OF
|
||||
22B7; 22B6 # IMAGE OF
|
||||
22B8; 27DC # MULTIMAP
|
||||
22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CB; 22CC # LEFT SEMIDIRECT PRODUCT
|
||||
22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
|
||||
22CD; 2243 # REVERSED TILDE EQUALS
|
||||
22D0; 22D1 # DOUBLE SUBSET
|
||||
22D1; 22D0 # DOUBLE SUPERSET
|
||||
22D6; 22D7 # LESS-THAN WITH DOT
|
||||
22D7; 22D6 # GREATER-THAN WITH DOT
|
||||
22D8; 22D9 # VERY MUCH LESS-THAN
|
||||
22D9; 22D8 # VERY MUCH GREATER-THAN
|
||||
22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
|
||||
22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
|
||||
22DC; 22DD # EQUAL TO OR LESS-THAN
|
||||
22DD; 22DC # EQUAL TO OR GREATER-THAN
|
||||
22DE; 22DF # EQUAL TO OR PRECEDES
|
||||
22DF; 22DE # EQUAL TO OR SUCCEEDS
|
||||
22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
|
||||
22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
|
||||
22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
|
||||
22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
|
||||
22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
|
||||
22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
|
||||
22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
|
||||
22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
|
||||
22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
|
||||
22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
|
||||
22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
|
||||
22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
|
||||
22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
|
||||
22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
|
||||
22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
|
||||
22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
|
||||
22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F6; 22FD # ELEMENT OF WITH OVERBAR
|
||||
22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
|
||||
22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
|
||||
22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FD; 22F6 # CONTAINS WITH OVERBAR
|
||||
22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
|
||||
2308; 2309 # LEFT CEILING
|
||||
2309; 2308 # RIGHT CEILING
|
||||
230A; 230B # LEFT FLOOR
|
||||
230B; 230A # RIGHT FLOOR
|
||||
2329; 232A # LEFT-POINTING ANGLE BRACKET
|
||||
232A; 2329 # RIGHT-POINTING ANGLE BRACKET
|
||||
2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
|
||||
2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
|
||||
276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
|
||||
276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
|
||||
276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
|
||||
2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
|
||||
27C3; 27C4 # OPEN SUBSET
|
||||
27C4; 27C3 # OPEN SUPERSET
|
||||
27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
|
||||
27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
|
||||
27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
|
||||
27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
|
||||
27CB; 27CD # MATHEMATICAL RISING DIAGONAL
|
||||
27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
|
||||
27D5; 27D6 # LEFT OUTER JOIN
|
||||
27D6; 27D5 # RIGHT OUTER JOIN
|
||||
27DC; 22B8 # LEFT MULTIMAP
|
||||
27DD; 27DE # LONG RIGHT TACK
|
||||
27DE; 27DD # LONG LEFT TACK
|
||||
27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
|
||||
27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
|
||||
27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
|
||||
27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
|
||||
27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
|
||||
27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
|
||||
27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
|
||||
27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
|
||||
27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
|
||||
27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
|
||||
27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
|
||||
27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
|
||||
27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
|
||||
2983; 2984 # LEFT WHITE CURLY BRACKET
|
||||
2984; 2983 # RIGHT WHITE CURLY BRACKET
|
||||
2985; 2986 # LEFT WHITE PARENTHESIS
|
||||
2986; 2985 # RIGHT WHITE PARENTHESIS
|
||||
2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
|
||||
2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
|
||||
2989; 298A # Z NOTATION LEFT BINDING BRACKET
|
||||
298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
|
||||
298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
|
||||
298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
|
||||
298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
2991; 2992 # LEFT ANGLE BRACKET WITH DOT
|
||||
2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
|
||||
2993; 2994 # LEFT ARC LESS-THAN BRACKET
|
||||
2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
|
||||
2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
|
||||
2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
|
||||
2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
|
||||
2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
|
||||
299B; 2221 # MEASURED ANGLE OPENING LEFT
|
||||
29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
|
||||
29A3; 2220 # REVERSED ANGLE
|
||||
29A4; 29A5 # ANGLE WITH UNDERBAR
|
||||
29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
|
||||
29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
|
||||
29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
|
||||
29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
|
||||
29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
|
||||
29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
|
||||
29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
|
||||
29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
|
||||
29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
|
||||
29B8; 2298 # CIRCLED REVERSE SOLIDUS
|
||||
29C0; 29C1 # CIRCLED LESS-THAN
|
||||
29C1; 29C0 # CIRCLED GREATER-THAN
|
||||
29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
|
||||
29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
|
||||
29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
|
||||
29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
|
||||
29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
|
||||
29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
|
||||
29D4; 29D5 # TIMES WITH LEFT HALF BLACK
|
||||
29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
|
||||
29D8; 29D9 # LEFT WIGGLY FENCE
|
||||
29D9; 29D8 # RIGHT WIGGLY FENCE
|
||||
29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
|
||||
29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
|
||||
29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
|
||||
29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
|
||||
29F5; 2215 # REVERSE SOLIDUS OPERATOR
|
||||
29F8; 29F9 # BIG SOLIDUS
|
||||
29F9; 29F8 # BIG REVERSE SOLIDUS
|
||||
29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
|
||||
29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
|
||||
2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
|
||||
2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
|
||||
2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
|
||||
2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
|
||||
2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
|
||||
2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
|
||||
2A3C; 2A3D # INTERIOR PRODUCT
|
||||
2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
|
||||
2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
|
||||
2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
|
||||
2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
|
||||
2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
|
||||
2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
|
||||
2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
|
||||
2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
|
||||
2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
|
||||
2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
|
||||
2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
|
||||
2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
|
||||
2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
|
||||
2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
|
||||
2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
|
||||
2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
|
||||
2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
|
||||
2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
|
||||
2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
|
||||
2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
|
||||
2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
|
||||
2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
|
||||
2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
|
||||
2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
|
||||
2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
|
||||
2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
|
||||
2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
|
||||
2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
|
||||
2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
|
||||
2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
|
||||
2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
|
||||
2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
|
||||
2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
|
||||
2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
|
||||
2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
|
||||
2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AAA; 2AAB # SMALLER THAN
|
||||
2AAB; 2AAA # LARGER THAN
|
||||
2AAC; 2AAD # SMALLER THAN OR EQUAL TO
|
||||
2AAD; 2AAC # LARGER THAN OR EQUAL TO
|
||||
2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
|
||||
2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
|
||||
2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
|
||||
2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
|
||||
2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
|
||||
2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
|
||||
2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
|
||||
2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
|
||||
2ABB; 2ABC # DOUBLE PRECEDES
|
||||
2ABC; 2ABB # DOUBLE SUCCEEDS
|
||||
2ABD; 2ABE # SUBSET WITH DOT
|
||||
2ABE; 2ABD # SUPERSET WITH DOT
|
||||
2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
|
||||
2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
|
||||
2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
|
||||
2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
|
||||
2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
|
||||
2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
|
||||
2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
|
||||
2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
|
||||
2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
|
||||
2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
|
||||
2ACF; 2AD0 # CLOSED SUBSET
|
||||
2AD0; 2ACF # CLOSED SUPERSET
|
||||
2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
|
||||
2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
|
||||
2AD3; 2AD4 # SUBSET ABOVE SUPERSET
|
||||
2AD4; 2AD3 # SUPERSET ABOVE SUBSET
|
||||
2AD5; 2AD6 # SUBSET ABOVE SUBSET
|
||||
2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
|
||||
2ADE; 22A6 # SHORT LEFT TACK
|
||||
2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
|
||||
2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AEC; 2AED # DOUBLE STROKE NOT SIGN
|
||||
2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
|
||||
2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
|
||||
2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
|
||||
2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
|
||||
2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
|
||||
2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
|
||||
2BFE; 221F # REVERSED RIGHT ANGLE
|
||||
2E02; 2E03 # LEFT SUBSTITUTION BRACKET
|
||||
2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
|
||||
2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
|
||||
2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
|
||||
2E09; 2E0A # LEFT TRANSPOSITION BRACKET
|
||||
2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
|
||||
2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
|
||||
2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
|
||||
2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
|
||||
2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
|
||||
2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
|
||||
2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
|
||||
2E22; 2E23 # TOP LEFT HALF BRACKET
|
||||
2E23; 2E22 # TOP RIGHT HALF BRACKET
|
||||
2E24; 2E25 # BOTTOM LEFT HALF BRACKET
|
||||
2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
|
||||
2E26; 2E27 # LEFT SIDEWAYS U BRACKET
|
||||
2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
|
||||
2E28; 2E29 # LEFT DOUBLE PARENTHESIS
|
||||
2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
|
||||
2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
|
||||
2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59; 2E5A # TOP HALF LEFT PARENTHESIS
|
||||
2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
|
||||
2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
|
||||
2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
|
||||
3008; 3009 # LEFT ANGLE BRACKET
|
||||
3009; 3008 # RIGHT ANGLE BRACKET
|
||||
300A; 300B # LEFT DOUBLE ANGLE BRACKET
|
||||
300B; 300A # RIGHT DOUBLE ANGLE BRACKET
|
||||
300C; 300D # [BEST FIT] LEFT CORNER BRACKET
|
||||
300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
|
||||
300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
|
||||
300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
|
||||
3010; 3011 # LEFT BLACK LENTICULAR BRACKET
|
||||
3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
|
||||
3014; 3015 # LEFT TORTOISE SHELL BRACKET
|
||||
3015; 3014 # RIGHT TORTOISE SHELL BRACKET
|
||||
3016; 3017 # LEFT WHITE LENTICULAR BRACKET
|
||||
3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
|
||||
3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
|
||||
3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
301A; 301B # LEFT WHITE SQUARE BRACKET
|
||||
301B; 301A # RIGHT WHITE SQUARE BRACKET
|
||||
FE59; FE5A # SMALL LEFT PARENTHESIS
|
||||
FE5A; FE59 # SMALL RIGHT PARENTHESIS
|
||||
FE5B; FE5C # SMALL LEFT CURLY BRACKET
|
||||
FE5C; FE5B # SMALL RIGHT CURLY BRACKET
|
||||
FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
|
||||
FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
|
||||
FE64; FE65 # SMALL LESS-THAN SIGN
|
||||
FE65; FE64 # SMALL GREATER-THAN SIGN
|
||||
FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
|
||||
FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
|
||||
FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
|
||||
FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
|
||||
FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
|
||||
FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
|
||||
FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
|
||||
FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
|
||||
FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
|
||||
FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
|
||||
FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
||||
# The following characters have no appropriate mirroring character.
|
||||
# For these characters it is up to the rendering system
|
||||
# to provide mirrored glyphs.
|
||||
|
||||
# 2140; DOUBLE-STRUCK N-ARY SUMMATION
|
||||
# 2201; COMPLEMENT
|
||||
# 2202; PARTIAL DIFFERENTIAL
|
||||
# 2203; THERE EXISTS
|
||||
# 2204; THERE DOES NOT EXIST
|
||||
# 2211; N-ARY SUMMATION
|
||||
# 2216; SET MINUS
|
||||
# 221A; SQUARE ROOT
|
||||
# 221B; CUBE ROOT
|
||||
# 221C; FOURTH ROOT
|
||||
# 221D; PROPORTIONAL TO
|
||||
# 2226; NOT PARALLEL TO
|
||||
# 222B; INTEGRAL
|
||||
# 222C; DOUBLE INTEGRAL
|
||||
# 222D; TRIPLE INTEGRAL
|
||||
# 222E; CONTOUR INTEGRAL
|
||||
# 222F; SURFACE INTEGRAL
|
||||
# 2230; VOLUME INTEGRAL
|
||||
# 2231; CLOCKWISE INTEGRAL
|
||||
# 2232; CLOCKWISE CONTOUR INTEGRAL
|
||||
# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
|
||||
# 2239; EXCESS
|
||||
# 223B; HOMOTHETIC
|
||||
# 223E; INVERTED LAZY S
|
||||
# 223F; SINE WAVE
|
||||
# 2240; WREATH PRODUCT
|
||||
# 2241; NOT TILDE
|
||||
# 2242; MINUS TILDE
|
||||
# 2244; NOT ASYMPTOTICALLY EQUAL TO
|
||||
# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
|
||||
# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
|
||||
# 2248; ALMOST EQUAL TO
|
||||
# 2249; NOT ALMOST EQUAL TO
|
||||
# 224A; ALMOST EQUAL OR EQUAL TO
|
||||
# 224B; TRIPLE TILDE
|
||||
# 225F; QUESTIONED EQUAL TO
|
||||
# 2260; NOT EQUAL TO
|
||||
# 2262; NOT IDENTICAL TO
|
||||
# 228C; MULTISET
|
||||
# 22A7; MODELS
|
||||
# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
|
||||
# 22AC; DOES NOT PROVE
|
||||
# 22AD; NOT TRUE
|
||||
# 22AE; DOES NOT FORCE
|
||||
# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
# 22BE; RIGHT ANGLE WITH ARC
|
||||
# 22BF; RIGHT TRIANGLE
|
||||
# 22F5; ELEMENT OF WITH DOT ABOVE
|
||||
# 22F8; ELEMENT OF WITH UNDERBAR
|
||||
# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
|
||||
# 22FF; Z NOTATION BAG MEMBERSHIP
|
||||
# 2320; TOP HALF INTEGRAL
|
||||
# 2321; BOTTOM HALF INTEGRAL
|
||||
# 27C0; THREE DIMENSIONAL ANGLE
|
||||
# 27CC; LONG DIVISION
|
||||
# 27D3; LOWER RIGHT CORNER WITH DOT
|
||||
# 27D4; UPPER LEFT CORNER WITH DOT
|
||||
# 299C; RIGHT ANGLE VARIANT WITH SQUARE
|
||||
# 299D; MEASURED RIGHT ANGLE WITH DOT
|
||||
# 299E; ANGLE WITH S INSIDE
|
||||
# 299F; ACUTE ANGLE
|
||||
# 29A2; TURNED ANGLE
|
||||
# 29A6; OBLIQUE ANGLE OPENING UP
|
||||
# 29A7; OBLIQUE ANGLE OPENING DOWN
|
||||
# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
|
||||
# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
|
||||
# 29C9; TWO JOINED SQUARES
|
||||
# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
|
||||
# 29DC; INCOMPLETE INFINITY
|
||||
# 29E1; INCREASES AS
|
||||
# 29E3; EQUALS SIGN AND SLANTED PARALLEL
|
||||
# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
|
||||
# 29E5; IDENTICAL TO AND SLANTED PARALLEL
|
||||
# 29F4; RULE-DELAYED
|
||||
# 29F6; SOLIDUS WITH OVERBAR
|
||||
# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
|
||||
# 2A0A; MODULO TWO SUM
|
||||
# 2A0B; SUMMATION WITH INTEGRAL
|
||||
# 2A0C; QUADRUPLE INTEGRAL OPERATOR
|
||||
# 2A0D; FINITE PART INTEGRAL
|
||||
# 2A0E; INTEGRAL WITH DOUBLE STROKE
|
||||
# 2A0F; INTEGRAL AVERAGE WITH SLASH
|
||||
# 2A10; CIRCULATION FUNCTION
|
||||
# 2A11; ANTICLOCKWISE INTEGRATION
|
||||
# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
|
||||
# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
|
||||
# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
|
||||
# 2A15; INTEGRAL AROUND A POINT OPERATOR
|
||||
# 2A16; QUATERNION INTEGRAL OPERATOR
|
||||
# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
|
||||
# 2A18; INTEGRAL WITH TIMES SIGN
|
||||
# 2A19; INTEGRAL WITH INTERSECTION
|
||||
# 2A1A; INTEGRAL WITH UNION
|
||||
# 2A1B; INTEGRAL WITH OVERBAR
|
||||
# 2A1C; INTEGRAL WITH UNDERBAR
|
||||
# 2A1E; LARGE LEFT TRIANGLE OPERATOR
|
||||
# 2A1F; Z NOTATION SCHEMA COMPOSITION
|
||||
# 2A20; Z NOTATION SCHEMA PIPING
|
||||
# 2A21; Z NOTATION SCHEMA PROJECTION
|
||||
# 2A24; PLUS SIGN WITH TILDE ABOVE
|
||||
# 2A26; PLUS SIGN WITH TILDE BELOW
|
||||
# 2A29; MINUS SIGN WITH COMMA ABOVE
|
||||
# 2A3E; Z NOTATION RELATIONAL COMPOSITION
|
||||
# 2A57; SLOPING LARGE OR
|
||||
# 2A58; SLOPING LARGE AND
|
||||
# 2A6A; TILDE OPERATOR WITH DOT ABOVE
|
||||
# 2A6B; TILDE OPERATOR WITH RISING DOTS
|
||||
# 2A6C; SIMILAR MINUS SIMILAR
|
||||
# 2A6D; CONGRUENT WITH DOT ABOVE
|
||||
# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
|
||||
# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
|
||||
# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
|
||||
# 2A74; DOUBLE COLON EQUAL
|
||||
# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
|
||||
# 2ADC; FORKING
|
||||
# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
|
||||
# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
|
||||
# 2AF3; PARALLEL WITH TILDE OPERATOR
|
||||
# 2AFB; TRIPLE SOLIDUS BINARY RELATION
|
||||
# 2AFD; DOUBLE SOLIDUS OPERATOR
|
||||
# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
|
||||
# EOF
|
|
@ -1,6 +1,6 @@
|
|||
# CaseFolding-13.0.0.txt
|
||||
# Date: 2019-09-08, 23:30:59 GMT
|
||||
# © 2019 Unicode®, Inc.
|
||||
# CaseFolding-14.0.0.txt
|
||||
# Date: 2021-03-08, 19:35:41 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -1050,6 +1050,7 @@
|
|||
2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC
|
||||
2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A
|
||||
2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
|
||||
2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
|
||||
2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR
|
||||
2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE
|
||||
2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE
|
||||
|
@ -1230,12 +1231,16 @@ A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE
|
|||
A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A
|
||||
A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I
|
||||
A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U
|
||||
A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O
|
||||
A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W
|
||||
A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK
|
||||
A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK
|
||||
A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK
|
||||
A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
|
||||
A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
|
||||
A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
|
||||
A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
|
||||
A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H
|
||||
AB70; C; 13A0; # CHEROKEE SMALL LETTER A
|
||||
AB71; C; 13A1; # CHEROKEE SMALL LETTER E
|
||||
|
@ -1431,6 +1436,41 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
|
|||
104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA
|
||||
104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA
|
||||
104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA
|
||||
10570; C; 10597; # VITHKUQI CAPITAL LETTER A
|
||||
10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE
|
||||
10572; C; 10599; # VITHKUQI CAPITAL LETTER BE
|
||||
10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE
|
||||
10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE
|
||||
10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE
|
||||
10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE
|
||||
10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI
|
||||
10578; C; 1059F; # VITHKUQI CAPITAL LETTER E
|
||||
10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE
|
||||
1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA
|
||||
1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA
|
||||
1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA
|
||||
1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I
|
||||
1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE
|
||||
10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE
|
||||
10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA
|
||||
10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA
|
||||
10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA
|
||||
10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME
|
||||
10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE
|
||||
10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE
|
||||
10587; C; 105AE; # VITHKUQI CAPITAL LETTER O
|
||||
10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE
|
||||
10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA
|
||||
1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE
|
||||
1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE
|
||||
1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE
|
||||
1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE
|
||||
1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE
|
||||
10590; C; 105B7; # VITHKUQI CAPITAL LETTER U
|
||||
10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE
|
||||
10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE
|
||||
10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y
|
||||
10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE
|
||||
10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A
|
||||
10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA
|
||||
10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,6 @@
|
|||
# DerivedGeneralCategory-13.0.0.txt
|
||||
# Date: 2019-10-21, 14:30:32 GMT
|
||||
# © 2019 Unicode®, Inc.
|
||||
# DerivedGeneralCategory-14.0.0.txt
|
||||
# Date: 2021-07-10, 00:35:08 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -27,7 +27,6 @@
|
|||
05C8..05CF ; Cn # [8] <reserved-05C8>..<reserved-05CF>
|
||||
05EB..05EE ; Cn # [4] <reserved-05EB>..<reserved-05EE>
|
||||
05F5..05FF ; Cn # [11] <reserved-05F5>..<reserved-05FF>
|
||||
061D ; Cn # <reserved-061D>
|
||||
070E ; Cn # <reserved-070E>
|
||||
074B..074C ; Cn # [2] <reserved-074B>..<reserved-074C>
|
||||
07B2..07BF ; Cn # [14] <reserved-07B2>..<reserved-07BF>
|
||||
|
@ -36,9 +35,9 @@
|
|||
083F ; Cn # <reserved-083F>
|
||||
085C..085D ; Cn # [2] <reserved-085C>..<reserved-085D>
|
||||
085F ; Cn # <reserved-085F>
|
||||
086B..089F ; Cn # [53] <reserved-086B>..<reserved-089F>
|
||||
08B5 ; Cn # <reserved-08B5>
|
||||
08C8..08D2 ; Cn # [11] <reserved-08C8>..<reserved-08D2>
|
||||
086B..086F ; Cn # [5] <reserved-086B>..<reserved-086F>
|
||||
088F ; Cn # <reserved-088F>
|
||||
0892..0897 ; Cn # [6] <reserved-0892>..<reserved-0897>
|
||||
0984 ; Cn # <reserved-0984>
|
||||
098D..098E ; Cn # [2] <reserved-098D>..<reserved-098E>
|
||||
0991..0992 ; Cn # [2] <reserved-0991>..<reserved-0992>
|
||||
|
@ -116,12 +115,13 @@
|
|||
0C0D ; Cn # <reserved-0C0D>
|
||||
0C11 ; Cn # <reserved-0C11>
|
||||
0C29 ; Cn # <reserved-0C29>
|
||||
0C3A..0C3C ; Cn # [3] <reserved-0C3A>..<reserved-0C3C>
|
||||
0C3A..0C3B ; Cn # [2] <reserved-0C3A>..<reserved-0C3B>
|
||||
0C45 ; Cn # <reserved-0C45>
|
||||
0C49 ; Cn # <reserved-0C49>
|
||||
0C4E..0C54 ; Cn # [7] <reserved-0C4E>..<reserved-0C54>
|
||||
0C57 ; Cn # <reserved-0C57>
|
||||
0C5B..0C5F ; Cn # [5] <reserved-0C5B>..<reserved-0C5F>
|
||||
0C5B..0C5C ; Cn # [2] <reserved-0C5B>..<reserved-0C5C>
|
||||
0C5E..0C5F ; Cn # [2] <reserved-0C5E>..<reserved-0C5F>
|
||||
0C64..0C65 ; Cn # [2] <reserved-0C64>..<reserved-0C65>
|
||||
0C70..0C76 ; Cn # [7] <reserved-0C70>..<reserved-0C76>
|
||||
0C8D ; Cn # <reserved-0C8D>
|
||||
|
@ -132,7 +132,7 @@
|
|||
0CC5 ; Cn # <reserved-0CC5>
|
||||
0CC9 ; Cn # <reserved-0CC9>
|
||||
0CCE..0CD4 ; Cn # [7] <reserved-0CCE>..<reserved-0CD4>
|
||||
0CD7..0CDD ; Cn # [7] <reserved-0CD7>..<reserved-0CDD>
|
||||
0CD7..0CDC ; Cn # [6] <reserved-0CD7>..<reserved-0CDC>
|
||||
0CDF ; Cn # <reserved-0CDF>
|
||||
0CE4..0CE5 ; Cn # [2] <reserved-0CE4>..<reserved-0CE5>
|
||||
0CF0 ; Cn # <reserved-0CF0>
|
||||
|
@ -200,8 +200,7 @@
|
|||
13FE..13FF ; Cn # [2] <reserved-13FE>..<reserved-13FF>
|
||||
169D..169F ; Cn # [3] <reserved-169D>..<reserved-169F>
|
||||
16F9..16FF ; Cn # [7] <reserved-16F9>..<reserved-16FF>
|
||||
170D ; Cn # <reserved-170D>
|
||||
1715..171F ; Cn # [11] <reserved-1715>..<reserved-171F>
|
||||
1716..171E ; Cn # [9] <reserved-1716>..<reserved-171E>
|
||||
1737..173F ; Cn # [9] <reserved-1737>..<reserved-173F>
|
||||
1754..175F ; Cn # [12] <reserved-1754>..<reserved-175F>
|
||||
176D ; Cn # <reserved-176D>
|
||||
|
@ -210,7 +209,6 @@
|
|||
17DE..17DF ; Cn # [2] <reserved-17DE>..<reserved-17DF>
|
||||
17EA..17EF ; Cn # [6] <reserved-17EA>..<reserved-17EF>
|
||||
17FA..17FF ; Cn # [6] <reserved-17FA>..<reserved-17FF>
|
||||
180F ; Cn # <reserved-180F>
|
||||
181A..181F ; Cn # [6] <reserved-181A>..<reserved-181F>
|
||||
1879..187F ; Cn # [7] <reserved-1879>..<reserved-187F>
|
||||
18AB..18AF ; Cn # [5] <reserved-18AB>..<reserved-18AF>
|
||||
|
@ -230,9 +228,9 @@
|
|||
1A8A..1A8F ; Cn # [6] <reserved-1A8A>..<reserved-1A8F>
|
||||
1A9A..1A9F ; Cn # [6] <reserved-1A9A>..<reserved-1A9F>
|
||||
1AAE..1AAF ; Cn # [2] <reserved-1AAE>..<reserved-1AAF>
|
||||
1AC1..1AFF ; Cn # [63] <reserved-1AC1>..<reserved-1AFF>
|
||||
1B4C..1B4F ; Cn # [4] <reserved-1B4C>..<reserved-1B4F>
|
||||
1B7D..1B7F ; Cn # [3] <reserved-1B7D>..<reserved-1B7F>
|
||||
1ACF..1AFF ; Cn # [49] <reserved-1ACF>..<reserved-1AFF>
|
||||
1B4D..1B4F ; Cn # [3] <reserved-1B4D>..<reserved-1B4F>
|
||||
1B7F ; Cn # <reserved-1B7F>
|
||||
1BF4..1BFB ; Cn # [8] <reserved-1BF4>..<reserved-1BFB>
|
||||
1C38..1C3A ; Cn # [3] <reserved-1C38>..<reserved-1C3A>
|
||||
1C4A..1C4C ; Cn # [3] <reserved-1C4A>..<reserved-1C4C>
|
||||
|
@ -240,7 +238,6 @@
|
|||
1CBB..1CBC ; Cn # [2] <reserved-1CBB>..<reserved-1CBC>
|
||||
1CC8..1CCF ; Cn # [8] <reserved-1CC8>..<reserved-1CCF>
|
||||
1CFB..1CFF ; Cn # [5] <reserved-1CFB>..<reserved-1CFF>
|
||||
1DFA ; Cn # <reserved-1DFA>
|
||||
1F16..1F17 ; Cn # [2] <reserved-1F16>..<reserved-1F17>
|
||||
1F1E..1F1F ; Cn # [2] <reserved-1F1E>..<reserved-1F1F>
|
||||
1F46..1F47 ; Cn # [2] <reserved-1F46>..<reserved-1F47>
|
||||
|
@ -261,15 +258,13 @@
|
|||
2072..2073 ; Cn # [2] <reserved-2072>..<reserved-2073>
|
||||
208F ; Cn # <reserved-208F>
|
||||
209D..209F ; Cn # [3] <reserved-209D>..<reserved-209F>
|
||||
20C0..20CF ; Cn # [16] <reserved-20C0>..<reserved-20CF>
|
||||
20C1..20CF ; Cn # [15] <reserved-20C1>..<reserved-20CF>
|
||||
20F1..20FF ; Cn # [15] <reserved-20F1>..<reserved-20FF>
|
||||
218C..218F ; Cn # [4] <reserved-218C>..<reserved-218F>
|
||||
2427..243F ; Cn # [25] <reserved-2427>..<reserved-243F>
|
||||
244B..245F ; Cn # [21] <reserved-244B>..<reserved-245F>
|
||||
2B74..2B75 ; Cn # [2] <reserved-2B74>..<reserved-2B75>
|
||||
2B96 ; Cn # <reserved-2B96>
|
||||
2C2F ; Cn # <reserved-2C2F>
|
||||
2C5F ; Cn # <reserved-2C5F>
|
||||
2CF4..2CF8 ; Cn # [5] <reserved-2CF4>..<reserved-2CF8>
|
||||
2D26 ; Cn # <reserved-2D26>
|
||||
2D28..2D2C ; Cn # [5] <reserved-2D28>..<reserved-2D2C>
|
||||
|
@ -285,7 +280,7 @@
|
|||
2DCF ; Cn # <reserved-2DCF>
|
||||
2DD7 ; Cn # <reserved-2DD7>
|
||||
2DDF ; Cn # <reserved-2DDF>
|
||||
2E53..2E7F ; Cn # [45] <reserved-2E53>..<reserved-2E7F>
|
||||
2E5E..2E7F ; Cn # [34] <reserved-2E5E>..<reserved-2E7F>
|
||||
2E9A ; Cn # <reserved-2E9A>
|
||||
2EF4..2EFF ; Cn # [12] <reserved-2EF4>..<reserved-2EFF>
|
||||
2FD6..2FEF ; Cn # [26] <reserved-2FD6>..<reserved-2FEF>
|
||||
|
@ -297,13 +292,14 @@
|
|||
318F ; Cn # <reserved-318F>
|
||||
31E4..31EF ; Cn # [12] <reserved-31E4>..<reserved-31EF>
|
||||
321F ; Cn # <reserved-321F>
|
||||
9FFD..9FFF ; Cn # [3] <reserved-9FFD>..<reserved-9FFF>
|
||||
A48D..A48F ; Cn # [3] <reserved-A48D>..<reserved-A48F>
|
||||
A4C7..A4CF ; Cn # [9] <reserved-A4C7>..<reserved-A4CF>
|
||||
A62C..A63F ; Cn # [20] <reserved-A62C>..<reserved-A63F>
|
||||
A6F8..A6FF ; Cn # [8] <reserved-A6F8>..<reserved-A6FF>
|
||||
A7C0..A7C1 ; Cn # [2] <reserved-A7C0>..<reserved-A7C1>
|
||||
A7CB..A7F4 ; Cn # [42] <reserved-A7CB>..<reserved-A7F4>
|
||||
A7CB..A7CF ; Cn # [5] <reserved-A7CB>..<reserved-A7CF>
|
||||
A7D2 ; Cn # <reserved-A7D2>
|
||||
A7D4 ; Cn # <reserved-A7D4>
|
||||
A7DA..A7F1 ; Cn # [24] <reserved-A7DA>..<reserved-A7F1>
|
||||
A82D..A82F ; Cn # [3] <reserved-A82D>..<reserved-A82F>
|
||||
A83A..A83F ; Cn # [6] <reserved-A83A>..<reserved-A83F>
|
||||
A878..A87F ; Cn # [8] <reserved-A878>..<reserved-A87F>
|
||||
|
@ -339,11 +335,10 @@ FB3D ; Cn # <reserved-FB3D>
|
|||
FB3F ; Cn # <reserved-FB3F>
|
||||
FB42 ; Cn # <reserved-FB42>
|
||||
FB45 ; Cn # <reserved-FB45>
|
||||
FBC2..FBD2 ; Cn # [17] <reserved-FBC2>..<reserved-FBD2>
|
||||
FD40..FD4F ; Cn # [16] <reserved-FD40>..<reserved-FD4F>
|
||||
FBC3..FBD2 ; Cn # [16] <reserved-FBC3>..<reserved-FBD2>
|
||||
FD90..FD91 ; Cn # [2] <reserved-FD90>..<reserved-FD91>
|
||||
FDC8..FDEF ; Cn # [40] <reserved-FDC8>..<noncharacter-FDEF>
|
||||
FDFE..FDFF ; Cn # [2] <reserved-FDFE>..<reserved-FDFF>
|
||||
FDC8..FDCE ; Cn # [7] <reserved-FDC8>..<reserved-FDCE>
|
||||
FDD0..FDEF ; Cn # [32] <noncharacter-FDD0>..<noncharacter-FDEF>
|
||||
FE1A..FE1F ; Cn # [6] <reserved-FE1A>..<reserved-FE1F>
|
||||
FE53 ; Cn # <reserved-FE53>
|
||||
FE67 ; Cn # <reserved-FE67>
|
||||
|
@ -387,10 +382,20 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
104FC..104FF ; Cn # [4] <reserved-104FC>..<reserved-104FF>
|
||||
10528..1052F ; Cn # [8] <reserved-10528>..<reserved-1052F>
|
||||
10564..1056E ; Cn # [11] <reserved-10564>..<reserved-1056E>
|
||||
10570..105FF ; Cn # [144] <reserved-10570>..<reserved-105FF>
|
||||
1057B ; Cn # <reserved-1057B>
|
||||
1058B ; Cn # <reserved-1058B>
|
||||
10593 ; Cn # <reserved-10593>
|
||||
10596 ; Cn # <reserved-10596>
|
||||
105A2 ; Cn # <reserved-105A2>
|
||||
105B2 ; Cn # <reserved-105B2>
|
||||
105BA ; Cn # <reserved-105BA>
|
||||
105BD..105FF ; Cn # [67] <reserved-105BD>..<reserved-105FF>
|
||||
10737..1073F ; Cn # [9] <reserved-10737>..<reserved-1073F>
|
||||
10756..1075F ; Cn # [10] <reserved-10756>..<reserved-1075F>
|
||||
10768..107FF ; Cn # [152] <reserved-10768>..<reserved-107FF>
|
||||
10768..1077F ; Cn # [24] <reserved-10768>..<reserved-1077F>
|
||||
10786 ; Cn # <reserved-10786>
|
||||
107B1 ; Cn # <reserved-107B1>
|
||||
107BB..107FF ; Cn # [69] <reserved-107BB>..<reserved-107FF>
|
||||
10806..10807 ; Cn # [2] <reserved-10806>..<reserved-10807>
|
||||
10809 ; Cn # <reserved-10809>
|
||||
10836 ; Cn # <reserved-10836>
|
||||
|
@ -433,12 +438,13 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
10EAE..10EAF ; Cn # [2] <reserved-10EAE>..<reserved-10EAF>
|
||||
10EB2..10EFF ; Cn # [78] <reserved-10EB2>..<reserved-10EFF>
|
||||
10F28..10F2F ; Cn # [8] <reserved-10F28>..<reserved-10F2F>
|
||||
10F5A..10FAF ; Cn # [86] <reserved-10F5A>..<reserved-10FAF>
|
||||
10F5A..10F6F ; Cn # [22] <reserved-10F5A>..<reserved-10F6F>
|
||||
10F8A..10FAF ; Cn # [38] <reserved-10F8A>..<reserved-10FAF>
|
||||
10FCC..10FDF ; Cn # [20] <reserved-10FCC>..<reserved-10FDF>
|
||||
10FF7..10FFF ; Cn # [9] <reserved-10FF7>..<reserved-10FFF>
|
||||
1104E..11051 ; Cn # [4] <reserved-1104E>..<reserved-11051>
|
||||
11070..1107E ; Cn # [15] <reserved-11070>..<reserved-1107E>
|
||||
110C2..110CC ; Cn # [11] <reserved-110C2>..<reserved-110CC>
|
||||
11076..1107E ; Cn # [9] <reserved-11076>..<reserved-1107E>
|
||||
110C3..110CC ; Cn # [10] <reserved-110C3>..<reserved-110CC>
|
||||
110CE..110CF ; Cn # [2] <reserved-110CE>..<reserved-110CF>
|
||||
110E9..110EF ; Cn # [7] <reserved-110E9>..<reserved-110EF>
|
||||
110FA..110FF ; Cn # [6] <reserved-110FA>..<reserved-110FF>
|
||||
|
@ -480,11 +486,11 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
11645..1164F ; Cn # [11] <reserved-11645>..<reserved-1164F>
|
||||
1165A..1165F ; Cn # [6] <reserved-1165A>..<reserved-1165F>
|
||||
1166D..1167F ; Cn # [19] <reserved-1166D>..<reserved-1167F>
|
||||
116B9..116BF ; Cn # [7] <reserved-116B9>..<reserved-116BF>
|
||||
116BA..116BF ; Cn # [6] <reserved-116BA>..<reserved-116BF>
|
||||
116CA..116FF ; Cn # [54] <reserved-116CA>..<reserved-116FF>
|
||||
1171B..1171C ; Cn # [2] <reserved-1171B>..<reserved-1171C>
|
||||
1172C..1172F ; Cn # [4] <reserved-1172C>..<reserved-1172F>
|
||||
11740..117FF ; Cn # [192] <reserved-11740>..<reserved-117FF>
|
||||
11747..117FF ; Cn # [185] <reserved-11747>..<reserved-117FF>
|
||||
1183C..1189F ; Cn # [100] <reserved-1183C>..<reserved-1189F>
|
||||
118F3..118FE ; Cn # [12] <reserved-118F3>..<reserved-118FE>
|
||||
11907..11908 ; Cn # [2] <reserved-11907>..<reserved-11908>
|
||||
|
@ -499,7 +505,7 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
119D8..119D9 ; Cn # [2] <reserved-119D8>..<reserved-119D9>
|
||||
119E5..119FF ; Cn # [27] <reserved-119E5>..<reserved-119FF>
|
||||
11A48..11A4F ; Cn # [8] <reserved-11A48>..<reserved-11A4F>
|
||||
11AA3..11ABF ; Cn # [29] <reserved-11AA3>..<reserved-11ABF>
|
||||
11AA3..11AAF ; Cn # [13] <reserved-11AA3>..<reserved-11AAF>
|
||||
11AF9..11BFF ; Cn # [263] <reserved-11AF9>..<reserved-11BFF>
|
||||
11C09 ; Cn # <reserved-11C09>
|
||||
11C37 ; Cn # <reserved-11C37>
|
||||
|
@ -527,14 +533,16 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1239A..123FF ; Cn # [102] <reserved-1239A>..<reserved-123FF>
|
||||
1246F ; Cn # <reserved-1246F>
|
||||
12475..1247F ; Cn # [11] <reserved-12475>..<reserved-1247F>
|
||||
12544..12FFF ; Cn # [2748] <reserved-12544>..<reserved-12FFF>
|
||||
12544..12F8F ; Cn # [2636] <reserved-12544>..<reserved-12F8F>
|
||||
12FF3..12FFF ; Cn # [13] <reserved-12FF3>..<reserved-12FFF>
|
||||
1342F ; Cn # <reserved-1342F>
|
||||
13439..143FF ; Cn # [4039] <reserved-13439>..<reserved-143FF>
|
||||
14647..167FF ; Cn # [8633] <reserved-14647>..<reserved-167FF>
|
||||
16A39..16A3F ; Cn # [7] <reserved-16A39>..<reserved-16A3F>
|
||||
16A5F ; Cn # <reserved-16A5F>
|
||||
16A6A..16A6D ; Cn # [4] <reserved-16A6A>..<reserved-16A6D>
|
||||
16A70..16ACF ; Cn # [96] <reserved-16A70>..<reserved-16ACF>
|
||||
16ABF ; Cn # <reserved-16ABF>
|
||||
16ACA..16ACF ; Cn # [6] <reserved-16ACA>..<reserved-16ACF>
|
||||
16AEE..16AEF ; Cn # [2] <reserved-16AEE>..<reserved-16AEF>
|
||||
16AF6..16AFF ; Cn # [10] <reserved-16AF6>..<reserved-16AFF>
|
||||
16B46..16B4F ; Cn # [10] <reserved-16B46>..<reserved-16B4F>
|
||||
|
@ -550,8 +558,11 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
16FF2..16FFF ; Cn # [14] <reserved-16FF2>..<reserved-16FFF>
|
||||
187F8..187FF ; Cn # [8] <reserved-187F8>..<reserved-187FF>
|
||||
18CD6..18CFF ; Cn # [42] <reserved-18CD6>..<reserved-18CFF>
|
||||
18D09..1AFFF ; Cn # [8951] <reserved-18D09>..<reserved-1AFFF>
|
||||
1B11F..1B14F ; Cn # [49] <reserved-1B11F>..<reserved-1B14F>
|
||||
18D09..1AFEF ; Cn # [8935] <reserved-18D09>..<reserved-1AFEF>
|
||||
1AFF4 ; Cn # <reserved-1AFF4>
|
||||
1AFFC ; Cn # <reserved-1AFFC>
|
||||
1AFFF ; Cn # <reserved-1AFFF>
|
||||
1B123..1B14F ; Cn # [45] <reserved-1B123>..<reserved-1B14F>
|
||||
1B153..1B163 ; Cn # [17] <reserved-1B153>..<reserved-1B163>
|
||||
1B168..1B16F ; Cn # [8] <reserved-1B168>..<reserved-1B16F>
|
||||
1B2FC..1BBFF ; Cn # [2308] <reserved-1B2FC>..<reserved-1BBFF>
|
||||
|
@ -559,10 +570,13 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1BC7D..1BC7F ; Cn # [3] <reserved-1BC7D>..<reserved-1BC7F>
|
||||
1BC89..1BC8F ; Cn # [7] <reserved-1BC89>..<reserved-1BC8F>
|
||||
1BC9A..1BC9B ; Cn # [2] <reserved-1BC9A>..<reserved-1BC9B>
|
||||
1BCA4..1CFFF ; Cn # [4956] <reserved-1BCA4>..<reserved-1CFFF>
|
||||
1BCA4..1CEFF ; Cn # [4700] <reserved-1BCA4>..<reserved-1CEFF>
|
||||
1CF2E..1CF2F ; Cn # [2] <reserved-1CF2E>..<reserved-1CF2F>
|
||||
1CF47..1CF4F ; Cn # [9] <reserved-1CF47>..<reserved-1CF4F>
|
||||
1CFC4..1CFFF ; Cn # [60] <reserved-1CFC4>..<reserved-1CFFF>
|
||||
1D0F6..1D0FF ; Cn # [10] <reserved-1D0F6>..<reserved-1D0FF>
|
||||
1D127..1D128 ; Cn # [2] <reserved-1D127>..<reserved-1D128>
|
||||
1D1E9..1D1FF ; Cn # [23] <reserved-1D1E9>..<reserved-1D1FF>
|
||||
1D1EB..1D1FF ; Cn # [21] <reserved-1D1EB>..<reserved-1D1FF>
|
||||
1D246..1D2DF ; Cn # [154] <reserved-1D246>..<reserved-1D2DF>
|
||||
1D2F4..1D2FF ; Cn # [12] <reserved-1D2F4>..<reserved-1D2FF>
|
||||
1D357..1D35F ; Cn # [9] <reserved-1D357>..<reserved-1D35F>
|
||||
|
@ -589,7 +603,8 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1D7CC..1D7CD ; Cn # [2] <reserved-1D7CC>..<reserved-1D7CD>
|
||||
1DA8C..1DA9A ; Cn # [15] <reserved-1DA8C>..<reserved-1DA9A>
|
||||
1DAA0 ; Cn # <reserved-1DAA0>
|
||||
1DAB0..1DFFF ; Cn # [1360] <reserved-1DAB0>..<reserved-1DFFF>
|
||||
1DAB0..1DEFF ; Cn # [1104] <reserved-1DAB0>..<reserved-1DEFF>
|
||||
1DF1F..1DFFF ; Cn # [225] <reserved-1DF1F>..<reserved-1DFFF>
|
||||
1E007 ; Cn # <reserved-1E007>
|
||||
1E019..1E01A ; Cn # [2] <reserved-1E019>..<reserved-1E01A>
|
||||
1E022 ; Cn # <reserved-1E022>
|
||||
|
@ -598,9 +613,14 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1E12D..1E12F ; Cn # [3] <reserved-1E12D>..<reserved-1E12F>
|
||||
1E13E..1E13F ; Cn # [2] <reserved-1E13E>..<reserved-1E13F>
|
||||
1E14A..1E14D ; Cn # [4] <reserved-1E14A>..<reserved-1E14D>
|
||||
1E150..1E2BF ; Cn # [368] <reserved-1E150>..<reserved-1E2BF>
|
||||
1E150..1E28F ; Cn # [320] <reserved-1E150>..<reserved-1E28F>
|
||||
1E2AF..1E2BF ; Cn # [17] <reserved-1E2AF>..<reserved-1E2BF>
|
||||
1E2FA..1E2FE ; Cn # [5] <reserved-1E2FA>..<reserved-1E2FE>
|
||||
1E300..1E7FF ; Cn # [1280] <reserved-1E300>..<reserved-1E7FF>
|
||||
1E300..1E7DF ; Cn # [1248] <reserved-1E300>..<reserved-1E7DF>
|
||||
1E7E7 ; Cn # <reserved-1E7E7>
|
||||
1E7EC ; Cn # <reserved-1E7EC>
|
||||
1E7EF ; Cn # <reserved-1E7EF>
|
||||
1E7FF ; Cn # <reserved-1E7FF>
|
||||
1E8C5..1E8C6 ; Cn # [2] <reserved-1E8C5>..<reserved-1E8C6>
|
||||
1E8D7..1E8FF ; Cn # [41] <reserved-1E8D7>..<reserved-1E8FF>
|
||||
1E94C..1E94F ; Cn # [4] <reserved-1E94C>..<reserved-1E94F>
|
||||
|
@ -654,34 +674,35 @@ FFFE..FFFF ; Cn # [2] <noncharacter-FFFE>..<noncharacter-FFFF>
|
|||
1F249..1F24F ; Cn # [7] <reserved-1F249>..<reserved-1F24F>
|
||||
1F252..1F25F ; Cn # [14] <reserved-1F252>..<reserved-1F25F>
|
||||
1F266..1F2FF ; Cn # [154] <reserved-1F266>..<reserved-1F2FF>
|
||||
1F6D8..1F6DF ; Cn # [8] <reserved-1F6D8>..<reserved-1F6DF>
|
||||
1F6D8..1F6DC ; Cn # [5] <reserved-1F6D8>..<reserved-1F6DC>
|
||||
1F6ED..1F6EF ; Cn # [3] <reserved-1F6ED>..<reserved-1F6EF>
|
||||
1F6FD..1F6FF ; Cn # [3] <reserved-1F6FD>..<reserved-1F6FF>
|
||||
1F774..1F77F ; Cn # [12] <reserved-1F774>..<reserved-1F77F>
|
||||
1F7D9..1F7DF ; Cn # [7] <reserved-1F7D9>..<reserved-1F7DF>
|
||||
1F7EC..1F7FF ; Cn # [20] <reserved-1F7EC>..<reserved-1F7FF>
|
||||
1F7EC..1F7EF ; Cn # [4] <reserved-1F7EC>..<reserved-1F7EF>
|
||||
1F7F1..1F7FF ; Cn # [15] <reserved-1F7F1>..<reserved-1F7FF>
|
||||
1F80C..1F80F ; Cn # [4] <reserved-1F80C>..<reserved-1F80F>
|
||||
1F848..1F84F ; Cn # [8] <reserved-1F848>..<reserved-1F84F>
|
||||
1F85A..1F85F ; Cn # [6] <reserved-1F85A>..<reserved-1F85F>
|
||||
1F888..1F88F ; Cn # [8] <reserved-1F888>..<reserved-1F88F>
|
||||
1F8AE..1F8AF ; Cn # [2] <reserved-1F8AE>..<reserved-1F8AF>
|
||||
1F8B2..1F8FF ; Cn # [78] <reserved-1F8B2>..<reserved-1F8FF>
|
||||
1F979 ; Cn # <reserved-1F979>
|
||||
1F9CC ; Cn # <reserved-1F9CC>
|
||||
1FA54..1FA5F ; Cn # [12] <reserved-1FA54>..<reserved-1FA5F>
|
||||
1FA6E..1FA6F ; Cn # [2] <reserved-1FA6E>..<reserved-1FA6F>
|
||||
1FA75..1FA77 ; Cn # [3] <reserved-1FA75>..<reserved-1FA77>
|
||||
1FA7B..1FA7F ; Cn # [5] <reserved-1FA7B>..<reserved-1FA7F>
|
||||
1FA7D..1FA7F ; Cn # [3] <reserved-1FA7D>..<reserved-1FA7F>
|
||||
1FA87..1FA8F ; Cn # [9] <reserved-1FA87>..<reserved-1FA8F>
|
||||
1FAA9..1FAAF ; Cn # [7] <reserved-1FAA9>..<reserved-1FAAF>
|
||||
1FAB7..1FABF ; Cn # [9] <reserved-1FAB7>..<reserved-1FABF>
|
||||
1FAC3..1FACF ; Cn # [13] <reserved-1FAC3>..<reserved-1FACF>
|
||||
1FAD7..1FAFF ; Cn # [41] <reserved-1FAD7>..<reserved-1FAFF>
|
||||
1FAAD..1FAAF ; Cn # [3] <reserved-1FAAD>..<reserved-1FAAF>
|
||||
1FABB..1FABF ; Cn # [5] <reserved-1FABB>..<reserved-1FABF>
|
||||
1FAC6..1FACF ; Cn # [10] <reserved-1FAC6>..<reserved-1FACF>
|
||||
1FADA..1FADF ; Cn # [6] <reserved-1FADA>..<reserved-1FADF>
|
||||
1FAE8..1FAEF ; Cn # [8] <reserved-1FAE8>..<reserved-1FAEF>
|
||||
1FAF7..1FAFF ; Cn # [9] <reserved-1FAF7>..<reserved-1FAFF>
|
||||
1FB93 ; Cn # <reserved-1FB93>
|
||||
1FBCB..1FBEF ; Cn # [37] <reserved-1FBCB>..<reserved-1FBEF>
|
||||
1FBFA..1FFFF ; Cn # [1030] <reserved-1FBFA>..<noncharacter-1FFFF>
|
||||
2A6DE..2A6FF ; Cn # [34] <reserved-2A6DE>..<reserved-2A6FF>
|
||||
2B735..2B73F ; Cn # [11] <reserved-2B735>..<reserved-2B73F>
|
||||
2A6E0..2A6FF ; Cn # [32] <reserved-2A6E0>..<reserved-2A6FF>
|
||||
2B739..2B73F ; Cn # [7] <reserved-2B739>..<reserved-2B73F>
|
||||
2B81E..2B81F ; Cn # [2] <reserved-2B81E>..<reserved-2B81F>
|
||||
2CEA2..2CEAF ; Cn # [14] <reserved-2CEA2>..<reserved-2CEAF>
|
||||
2EBE1..2F7FF ; Cn # [3103] <reserved-2EBE1>..<reserved-2F7FF>
|
||||
|
@ -693,7 +714,7 @@ E01F0..EFFFF ; Cn # [65040] <reserved-E01F0>..<noncharacter-EFFFF>
|
|||
FFFFE..FFFFF ; Cn # [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
|
||||
10FFFE..10FFFF; Cn # [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
|
||||
|
||||
# Total code points: 830672
|
||||
# Total code points: 829834
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1130,7 +1151,7 @@ FFFFE..FFFFF ; Cn # [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
|
|||
213E..213F ; Lu # [2] DOUBLE-STRUCK CAPITAL GAMMA..DOUBLE-STRUCK CAPITAL PI
|
||||
2145 ; Lu # DOUBLE-STRUCK ITALIC CAPITAL D
|
||||
2183 ; Lu # ROMAN NUMERAL REVERSED ONE HUNDRED
|
||||
2C00..2C2E ; Lu # [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
|
||||
2C00..2C2F ; Lu # [48] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
|
||||
2C60 ; Lu # LATIN CAPITAL LETTER L WITH DOUBLE BAR
|
||||
2C62..2C64 ; Lu # [3] LATIN CAPITAL LETTER L WITH MIDDLE TILDE..LATIN CAPITAL LETTER R WITH TAIL
|
||||
2C67 ; Lu # LATIN CAPITAL LETTER H WITH DESCENDER
|
||||
|
@ -1295,13 +1316,21 @@ A7B8 ; Lu # LATIN CAPITAL LETTER U WITH STROKE
|
|||
A7BA ; Lu # LATIN CAPITAL LETTER GLOTTAL A
|
||||
A7BC ; Lu # LATIN CAPITAL LETTER GLOTTAL I
|
||||
A7BE ; Lu # LATIN CAPITAL LETTER GLOTTAL U
|
||||
A7C0 ; Lu # LATIN CAPITAL LETTER OLD POLISH O
|
||||
A7C2 ; Lu # LATIN CAPITAL LETTER ANGLICANA W
|
||||
A7C4..A7C7 ; Lu # [4] LATIN CAPITAL LETTER C WITH PALATAL HOOK..LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
|
||||
A7C9 ; Lu # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D0 ; Lu # LATIN CAPITAL LETTER CLOSED INSULAR G
|
||||
A7D6 ; Lu # LATIN CAPITAL LETTER MIDDLE SCOTS S
|
||||
A7D8 ; Lu # LATIN CAPITAL LETTER SIGMOID S
|
||||
A7F5 ; Lu # LATIN CAPITAL LETTER REVERSED HALF H
|
||||
FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
10400..10427 ; Lu # [40] DESERET CAPITAL LETTER LONG I..DESERET CAPITAL LETTER EW
|
||||
104B0..104D3 ; Lu # [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
|
||||
10570..1057A ; Lu # [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
|
||||
1057C..1058A ; Lu # [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
|
||||
1058C..10592 ; Lu # [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
|
||||
10594..10595 ; Lu # [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
|
||||
10C80..10CB2 ; Lu # [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US
|
||||
118A0..118BF ; Lu # [32] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI CAPITAL LETTER VIYO
|
||||
16E40..16E5F ; Lu # [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y
|
||||
|
@ -1338,7 +1367,7 @@ FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP
|
|||
1D7CA ; Lu # MATHEMATICAL BOLD CAPITAL DIGAMMA
|
||||
1E900..1E921 ; Lu # [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA
|
||||
|
||||
# Total code points: 1791
|
||||
# Total code points: 1831
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1775,7 +1804,7 @@ FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP
|
|||
2146..2149 ; Ll # [4] DOUBLE-STRUCK ITALIC SMALL D..DOUBLE-STRUCK ITALIC SMALL J
|
||||
214E ; Ll # TURNED SMALL F
|
||||
2184 ; Ll # LATIN SMALL LETTER REVERSED C
|
||||
2C30..2C5E ; Ll # [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE
|
||||
2C30..2C5F ; Ll # [48] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
|
||||
2C61 ; Ll # LATIN SMALL LETTER L WITH DOUBLE BAR
|
||||
2C65..2C66 ; Ll # [2] LATIN SMALL LETTER A WITH STROKE..LATIN SMALL LETTER T WITH DIAGONAL STROKE
|
||||
2C68 ; Ll # LATIN SMALL LETTER H WITH DESCENDER
|
||||
|
@ -1944,9 +1973,15 @@ A7B9 ; Ll # LATIN SMALL LETTER U WITH STROKE
|
|||
A7BB ; Ll # LATIN SMALL LETTER GLOTTAL A
|
||||
A7BD ; Ll # LATIN SMALL LETTER GLOTTAL I
|
||||
A7BF ; Ll # LATIN SMALL LETTER GLOTTAL U
|
||||
A7C1 ; Ll # LATIN SMALL LETTER OLD POLISH O
|
||||
A7C3 ; Ll # LATIN SMALL LETTER ANGLICANA W
|
||||
A7C8 ; Ll # LATIN SMALL LETTER D WITH SHORT STROKE OVERLAY
|
||||
A7CA ; Ll # LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D1 ; Ll # LATIN SMALL LETTER CLOSED INSULAR G
|
||||
A7D3 ; Ll # LATIN SMALL LETTER DOUBLE THORN
|
||||
A7D5 ; Ll # LATIN SMALL LETTER DOUBLE WYNN
|
||||
A7D7 ; Ll # LATIN SMALL LETTER MIDDLE SCOTS S
|
||||
A7D9 ; Ll # LATIN SMALL LETTER SIGMOID S
|
||||
A7F6 ; Ll # LATIN SMALL LETTER REVERSED HALF H
|
||||
A7FA ; Ll # LATIN LETTER SMALL CAPITAL TURNED M
|
||||
AB30..AB5A ; Ll # [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
|
||||
|
@ -1957,6 +1992,10 @@ FB13..FB17 ; Ll # [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGAT
|
|||
FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
|
||||
10428..1044F ; Ll # [40] DESERET SMALL LETTER LONG I..DESERET SMALL LETTER EW
|
||||
104D8..104FB ; Ll # [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
|
||||
10597..105A1 ; Ll # [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
|
||||
105A3..105B1 ; Ll # [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
|
||||
105B3..105B9 ; Ll # [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
|
||||
105BB..105BC ; Ll # [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
|
||||
10CC0..10CF2 ; Ll # [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US
|
||||
118C0..118DF ; Ll # [32] WARANG CITI SMALL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
|
||||
16E60..16E7F ; Ll # [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y
|
||||
|
@ -1988,9 +2027,11 @@ FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL
|
|||
1D7AA..1D7C2 ; Ll # [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
|
||||
1D7C4..1D7C9 ; Ll # [6] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
|
||||
1D7CB ; Ll # MATHEMATICAL BOLD SMALL DIGAMMA
|
||||
1DF00..1DF09 ; Ll # [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
|
||||
1DF0B..1DF1E ; Ll # [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
|
||||
1E922..1E943 ; Ll # [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
|
||||
|
||||
# Total code points: 2155
|
||||
# Total code points: 2227
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2028,6 +2069,7 @@ FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL
|
|||
081A ; Lm # SAMARITAN MODIFIER LETTER EPENTHETIC YUT
|
||||
0824 ; Lm # SAMARITAN MODIFIER LETTER SHORT A
|
||||
0828 ; Lm # SAMARITAN MODIFIER LETTER I
|
||||
08C9 ; Lm # ARABIC SMALL FARSI YEH
|
||||
0971 ; Lm # DEVANAGARI SIGN HIGH SPACING DOT
|
||||
0E46 ; Lm # THAI CHARACTER MAIYAMOK
|
||||
0EC6 ; Lm # LAO KO LA
|
||||
|
@ -2058,6 +2100,7 @@ A69C..A69D ; Lm # [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER C
|
|||
A717..A71F ; Lm # [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
|
||||
A770 ; Lm # MODIFIER LETTER US
|
||||
A788 ; Lm # MODIFIER LETTER LOW CIRCUMFLEX ACCENT
|
||||
A7F2..A7F4 ; Lm # [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
|
||||
A7F8..A7F9 ; Lm # [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
|
||||
A9CF ; Lm # JAVANESE PANGRANGKEP
|
||||
A9E6 ; Lm # MYANMAR MODIFIER LETTER SHAN REDUPLICATION
|
||||
|
@ -2068,14 +2111,20 @@ AB5C..AB5F ; Lm # [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U W
|
|||
AB69 ; Lm # MODIFIER LETTER SMALL TURNED W
|
||||
FF70 ; Lm # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
10780..10785 ; Lm # [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
|
||||
10787..107B0 ; Lm # [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
|
||||
107B2..107BA ; Lm # [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
|
||||
16B40..16B43 ; Lm # [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
|
||||
16F93..16F9F ; Lm # [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
|
||||
16FE0..16FE1 ; Lm # [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
|
||||
16FE3 ; Lm # OLD CHINESE ITERATION MARK
|
||||
1AFF0..1AFF3 ; Lm # [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
|
||||
1AFF5..1AFFB ; Lm # [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
|
||||
1AFFD..1AFFE ; Lm # [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
|
||||
1E137..1E13D ; Lm # [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
|
||||
1E94B ; Lm # ADLAM NASALIZATION MARK
|
||||
|
||||
# Total code points: 260
|
||||
# Total code points: 334
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2104,8 +2153,9 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
0800..0815 ; Lo # [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
|
||||
0840..0858 ; Lo # [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
|
||||
0860..086A ; Lo # [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
|
||||
08A0..08B4 ; Lo # [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW
|
||||
08B6..08C7 ; Lo # [18] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
|
||||
0870..0887 ; Lo # [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
|
||||
0889..088E ; Lo # [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
|
||||
08A0..08C8 ; Lo # [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
|
||||
0904..0939 ; Lo # [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
|
||||
093D ; Lo # DEVANAGARI SIGN AVAGRAHA
|
||||
0950 ; Lo # DEVANAGARI OM
|
||||
|
@ -2170,6 +2220,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
0C2A..0C39 ; Lo # [16] TELUGU LETTER PA..TELUGU LETTER HA
|
||||
0C3D ; Lo # TELUGU SIGN AVAGRAHA
|
||||
0C58..0C5A ; Lo # [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
|
||||
0C5D ; Lo # TELUGU LETTER NAKAARA POLLU
|
||||
0C60..0C61 ; Lo # [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
|
||||
0C80 ; Lo # KANNADA SIGN SPACING CANDRABINDU
|
||||
0C85..0C8C ; Lo # [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
|
||||
|
@ -2178,7 +2229,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
0CAA..0CB3 ; Lo # [10] KANNADA LETTER PA..KANNADA LETTER LLA
|
||||
0CB5..0CB9 ; Lo # [5] KANNADA LETTER VA..KANNADA LETTER HA
|
||||
0CBD ; Lo # KANNADA SIGN AVAGRAHA
|
||||
0CDE ; Lo # KANNADA LETTER FA
|
||||
0CDD..0CDE ; Lo # [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
|
||||
0CE0..0CE1 ; Lo # [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
|
||||
0CF1..0CF2 ; Lo # [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
|
||||
0D04..0D0C ; Lo # [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
|
||||
|
@ -2242,9 +2293,8 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
1681..169A ; Lo # [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
|
||||
16A0..16EA ; Lo # [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
|
||||
16F1..16F8 ; Lo # [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
|
||||
1700..170C ; Lo # [13] TAGALOG LETTER A..TAGALOG LETTER YA
|
||||
170E..1711 ; Lo # [4] TAGALOG LETTER LA..TAGALOG LETTER HA
|
||||
1720..1731 ; Lo # [18] HANUNOO LETTER A..HANUNOO LETTER HA
|
||||
1700..1711 ; Lo # [18] TAGALOG LETTER A..TAGALOG LETTER HA
|
||||
171F..1731 ; Lo # [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA
|
||||
1740..1751 ; Lo # [18] BUHID LETTER A..BUHID LETTER HA
|
||||
1760..176C ; Lo # [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
|
||||
176E..1770 ; Lo # [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
|
||||
|
@ -2264,7 +2314,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
1A00..1A16 ; Lo # [23] BUGINESE LETTER KA..BUGINESE LETTER HA
|
||||
1A20..1A54 ; Lo # [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA
|
||||
1B05..1B33 ; Lo # [47] BALINESE LETTER AKARA..BALINESE LETTER HA
|
||||
1B45..1B4B ; Lo # [7] BALINESE LETTER KAF SASAK..BALINESE LETTER ASYURA SASAK
|
||||
1B45..1B4C ; Lo # [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
|
||||
1B83..1BA0 ; Lo # [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
|
||||
1BAE..1BAF ; Lo # [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
|
||||
1BBA..1BE5 ; Lo # [44] SUNDANESE AVAGRAHA..BATAK LETTER U
|
||||
|
@ -2297,8 +2347,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK
|
|||
31A0..31BF ; Lo # [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
|
||||
31F0..31FF ; Lo # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
|
||||
3400..4DBF ; Lo # [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
|
||||
4E00..9FFC ; Lo # [20989] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFC
|
||||
A000..A014 ; Lo # [21] YI SYLLABLE IT..YI SYLLABLE E
|
||||
4E00..A014 ; Lo # [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E
|
||||
A016..A48C ; Lo # [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
|
||||
A4D0..A4F7 ; Lo # [40] LISU LETTER BA..LISU LETTER OE
|
||||
A500..A60B ; Lo # [268] VAI SYLLABLE EE..VAI SYLLABLE NG
|
||||
|
@ -2426,9 +2475,12 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
10F00..10F1C ; Lo # [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
|
||||
10F27 ; Lo # OLD SOGDIAN LIGATURE AYIN-DALETH
|
||||
10F30..10F45 ; Lo # [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
|
||||
10F70..10F81 ; Lo # [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
|
||||
10FB0..10FC4 ; Lo # [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW
|
||||
10FE0..10FF6 ; Lo # [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH
|
||||
11003..11037 ; Lo # [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
|
||||
11071..11072 ; Lo # [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
|
||||
11075 ; Lo # BRAHMI LETTER OLD TAMIL LLA
|
||||
11083..110AF ; Lo # [45] KAITHI LETTER A..KAITHI LETTER HA
|
||||
110D0..110E8 ; Lo # [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE
|
||||
11103..11126 ; Lo # [36] CHAKMA LETTER AA..CHAKMA LETTER HAA
|
||||
|
@ -2470,6 +2522,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
11680..116AA ; Lo # [43] TAKRI LETTER A..TAKRI LETTER RRA
|
||||
116B8 ; Lo # TAKRI LETTER ARCHAIC KHA
|
||||
11700..1171A ; Lo # [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA
|
||||
11740..11746 ; Lo # [7] AHOM LETTER CA..AHOM LETTER LLA
|
||||
11800..1182B ; Lo # [44] DOGRA LETTER A..DOGRA LETTER RRA
|
||||
118FF..11906 ; Lo # [8] WARANG CITI OM..DIVES AKURU LETTER E
|
||||
11909 ; Lo # DIVES AKURU LETTER O
|
||||
|
@ -2488,7 +2541,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
11A50 ; Lo # SOYOMBO LETTER A
|
||||
11A5C..11A89 ; Lo # [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA
|
||||
11A9D ; Lo # SOYOMBO MARK PLUTA
|
||||
11AC0..11AF8 ; Lo # [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
|
||||
11AB0..11AF8 ; Lo # [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL
|
||||
11C00..11C08 ; Lo # [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
|
||||
11C0A..11C2E ; Lo # [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
|
||||
11C40 ; Lo # BHAIKSUKI SIGN AVAGRAHA
|
||||
|
@ -2505,10 +2558,12 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
11FB0 ; Lo # LISU LETTER YHA
|
||||
12000..12399 ; Lo # [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
|
||||
12480..12543 ; Lo # [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
|
||||
12F90..12FF0 ; Lo # [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
|
||||
13000..1342E ; Lo # [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
|
||||
14400..14646 ; Lo # [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
|
||||
16800..16A38 ; Lo # [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
|
||||
16A40..16A5E ; Lo # [31] MRO LETTER TA..MRO LETTER TEK
|
||||
16A70..16ABE ; Lo # [79] TANGSA LETTER OZ..TANGSA LETTER ZA
|
||||
16AD0..16AED ; Lo # [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
|
||||
16B00..16B2F ; Lo # [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
|
||||
16B63..16B77 ; Lo # [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
|
||||
|
@ -2518,7 +2573,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
17000..187F7 ; Lo # [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
|
||||
18800..18CD5 ; Lo # [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5
|
||||
18D00..18D08 ; Lo # [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
|
||||
1B000..1B11E ; Lo # [287] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER N-MU-MO-2
|
||||
1B000..1B122 ; Lo # [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
|
||||
1B150..1B152 ; Lo # [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
|
||||
1B164..1B167 ; Lo # [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
|
||||
1B170..1B2FB ; Lo # [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
|
||||
|
@ -2526,9 +2581,15 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1BC70..1BC7C ; Lo # [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
|
||||
1BC80..1BC88 ; Lo # [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
|
||||
1BC90..1BC99 ; Lo # [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
|
||||
1DF0A ; Lo # LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
|
||||
1E100..1E12C ; Lo # [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
|
||||
1E14E ; Lo # NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
|
||||
1E290..1E2AD ; Lo # [30] TOTO LETTER PA..TOTO LETTER A
|
||||
1E2C0..1E2EB ; Lo # [44] WANCHO LETTER AA..WANCHO LETTER YIH
|
||||
1E7E0..1E7E6 ; Lo # [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
|
||||
1E7E8..1E7EB ; Lo # [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
|
||||
1E7ED..1E7EE ; Lo # [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
|
||||
1E7F0..1E7FE ; Lo # [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
|
||||
1E800..1E8C4 ; Lo # [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
|
||||
1EE00..1EE03 ; Lo # [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL
|
||||
1EE05..1EE1F ; Lo # [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF
|
||||
|
@ -2563,15 +2624,15 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1EEA1..1EEA3 ; Lo # [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
|
||||
1EEA5..1EEA9 ; Lo # [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
|
||||
1EEAB..1EEBB ; Lo # [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
|
||||
20000..2A6DD ; Lo # [42718] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DD
|
||||
2A700..2B734 ; Lo # [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
|
||||
20000..2A6DF ; Lo # [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
|
||||
2A700..2B738 ; Lo # [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
|
||||
2B740..2B81D ; Lo # [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
|
||||
2B820..2CEA1 ; Lo # [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
|
||||
2CEB0..2EBE0 ; Lo # [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
|
||||
2F800..2FA1D ; Lo # [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
|
||||
30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
|
||||
|
||||
# Total code points: 127004
|
||||
# Total code points: 127333
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2601,7 +2662,8 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
0825..0827 ; Mn # [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
|
||||
0829..082D ; Mn # [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
|
||||
0859..085B ; Mn # [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
|
||||
08D3..08E1 ; Mn # [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
|
||||
0898..089F ; Mn # [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
|
||||
08CA..08E1 ; Mn # [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
|
||||
08E3..0902 ; Mn # [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
|
||||
093A ; Mn # DEVANAGARI VOWEL SIGN OE
|
||||
093C ; Mn # DEVANAGARI SIGN NUKTA
|
||||
|
@ -2642,6 +2704,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
0BCD ; Mn # TAMIL SIGN VIRAMA
|
||||
0C00 ; Mn # TELUGU SIGN COMBINING CANDRABINDU ABOVE
|
||||
0C04 ; Mn # TELUGU SIGN COMBINING ANUSVARA ABOVE
|
||||
0C3C ; Mn # TELUGU SIGN NUKTA
|
||||
0C3E..0C40 ; Mn # [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
|
||||
0C46..0C48 ; Mn # [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
|
||||
0C4A..0C4D ; Mn # [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
|
||||
|
@ -2691,7 +2754,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
109D ; Mn # MYANMAR VOWEL SIGN AITON AI
|
||||
135D..135F ; Mn # [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
|
||||
1712..1714 ; Mn # [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
|
||||
1732..1734 ; Mn # [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
|
||||
1732..1733 ; Mn # [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
|
||||
1752..1753 ; Mn # [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
|
||||
1772..1773 ; Mn # [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
|
||||
17B4..17B5 ; Mn # [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
|
||||
|
@ -2700,6 +2763,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
17C9..17D3 ; Mn # [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
|
||||
17DD ; Mn # KHMER SIGN ATTHACAN
|
||||
180B..180D ; Mn # [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||
180F ; Mn # MONGOLIAN FREE VARIATION SELECTOR FOUR
|
||||
1885..1886 ; Mn # [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
|
||||
18A9 ; Mn # MONGOLIAN LETTER ALI GALI DAGALGA
|
||||
1920..1922 ; Mn # [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
|
||||
|
@ -2716,7 +2780,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1A73..1A7C ; Mn # [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
|
||||
1A7F ; Mn # TAI THAM COMBINING CRYPTOGRAMMIC DOT
|
||||
1AB0..1ABD ; Mn # [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
|
||||
1ABF..1AC0 ; Mn # [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
|
||||
1ABF..1ACE ; Mn # [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
|
||||
1B00..1B03 ; Mn # [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
|
||||
1B34 ; Mn # BALINESE SIGN REREKAN
|
||||
1B36..1B3A ; Mn # [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
|
||||
|
@ -2739,8 +2803,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
|
|||
1CED ; Mn # VEDIC SIGN TIRYAK
|
||||
1CF4 ; Mn # VEDIC TONE CANDRA ABOVE
|
||||
1CF8..1CF9 ; Mn # [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
1DC0..1DF9 ; Mn # [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
|
||||
1DFB..1DFF ; Mn # [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
1DC0..1DFF ; Mn # [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
20D0..20DC ; Mn # [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
|
||||
20E1 ; Mn # COMBINING LEFT RIGHT ARROW ABOVE
|
||||
20E5..20F0 ; Mn # [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
|
||||
|
@ -2799,11 +2862,15 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
|
|||
10D24..10D27 ; Mn # [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
|
||||
10EAB..10EAC ; Mn # [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
|
||||
10F46..10F50 ; Mn # [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
|
||||
10F82..10F85 ; Mn # [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
|
||||
11001 ; Mn # BRAHMI SIGN ANUSVARA
|
||||
11038..11046 ; Mn # [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
|
||||
11070 ; Mn # BRAHMI SIGN OLD TAMIL VIRAMA
|
||||
11073..11074 ; Mn # [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
|
||||
1107F..11081 ; Mn # [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
|
||||
110B3..110B6 ; Mn # [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
|
||||
110B9..110BA ; Mn # [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
|
||||
110C2 ; Mn # KAITHI VOWEL SIGN VOCALIC R
|
||||
11100..11102 ; Mn # [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
|
||||
11127..1112B ; Mn # [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
|
||||
1112D..11134 ; Mn # [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
|
||||
|
@ -2883,6 +2950,8 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
|
|||
16F8F..16F92 ; Mn # [4] MIAO TONE RIGHT..MIAO TONE BELOW
|
||||
16FE4 ; Mn # KHITAN SMALL SCRIPT FILLER
|
||||
1BC9D..1BC9E ; Mn # [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
|
||||
1CF00..1CF2D ; Mn # [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
|
||||
1CF30..1CF46 ; Mn # [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
|
||||
1D167..1D169 ; Mn # [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
|
||||
1D17B..1D182 ; Mn # [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
|
||||
1D185..1D18B ; Mn # [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
|
||||
|
@ -2900,12 +2969,13 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
|
|||
1E023..1E024 ; Mn # [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
|
||||
1E026..1E02A ; Mn # [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
|
||||
1E130..1E136 ; Mn # [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
|
||||
1E2AE ; Mn # TOTO SIGN RISING TONE
|
||||
1E2EC..1E2EF ; Mn # [4] WANCHO TONE TUP..WANCHO TONE KOINI
|
||||
1E8D0..1E8D6 ; Mn # [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
|
||||
1E944..1E94A ; Mn # [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
|
||||
E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
||||
|
||||
# Total code points: 1839
|
||||
# Total code points: 1950
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2980,6 +3050,8 @@ A670..A672 ; Me # [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRIL
|
|||
1087..108C ; Mc # [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3
|
||||
108F ; Mc # MYANMAR SIGN RUMAI PALAUNG TONE-5
|
||||
109A..109C ; Mc # [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
|
||||
1715 ; Mc # TAGALOG SIGN PAMUDPOD
|
||||
1734 ; Mc # HANUNOO SIGN PAMUDPOD
|
||||
17B6 ; Mc # KHMER VOWEL SIGN AA
|
||||
17BE..17C5 ; Mc # [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
|
||||
17C7..17C8 ; Mc # [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
|
||||
|
@ -3099,7 +3171,7 @@ ABEC ; Mc # MEETEI MAYEK LUM IYEK
|
|||
1D165..1D166 ; Mc # [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
|
||||
1D16D..1D172 ; Mc # [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5
|
||||
|
||||
# Total code points: 443
|
||||
# Total code points: 445
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3160,6 +3232,7 @@ FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
|
|||
11D50..11D59 ; Nd # [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
|
||||
11DA0..11DA9 ; Nd # [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
|
||||
16A60..16A69 ; Nd # [10] MRO DIGIT ZERO..MRO DIGIT NINE
|
||||
16AC0..16AC9 ; Nd # [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
|
||||
16B50..16B59 ; Nd # [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
|
||||
1D7CE..1D7FF ; Nd # [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
|
||||
1E140..1E149 ; Nd # [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
|
||||
|
@ -3167,7 +3240,7 @@ FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
|
|||
1E950..1E959 ; Nd # [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
|
||||
1FBF0..1FBF9 ; Nd # [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
|
||||
|
||||
# Total code points: 650
|
||||
# Total code points: 660
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3314,6 +3387,7 @@ A830..A835 ; No # [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTIO
|
|||
061C ; Cf # ARABIC LETTER MARK
|
||||
06DD ; Cf # ARABIC END OF AYAH
|
||||
070F ; Cf # SYRIAC ABBREVIATION MARK
|
||||
0890..0891 ; Cf # [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
|
||||
08E2 ; Cf # ARABIC DISPUTED END OF AYAH
|
||||
180E ; Cf # MONGOLIAN VOWEL SEPARATOR
|
||||
200B..200F ; Cf # [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
|
||||
|
@ -3330,7 +3404,7 @@ FFF9..FFFB ; Cf # [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION
|
|||
E0001 ; Cf # LANGUAGE TAG
|
||||
E0020..E007F ; Cf # [96] TAG SPACE..CANCEL TAG
|
||||
|
||||
# Total code points: 161
|
||||
# Total code points: 163
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3364,6 +3438,7 @@ D800..DFFF ; Cs # [2048] <surrogate-D800>..<surrogate-DFFF>
|
|||
2E1A ; Pd # HYPHEN WITH DIAERESIS
|
||||
2E3A..2E3B ; Pd # [2] TWO-EM DASH..THREE-EM DASH
|
||||
2E40 ; Pd # DOUBLE HYPHEN
|
||||
2E5D ; Pd # OBLIQUE HYPHEN
|
||||
301C ; Pd # WAVE DASH
|
||||
3030 ; Pd # WAVY DASH
|
||||
30A0 ; Pd # KATAKANA-HIRAGANA DOUBLE HYPHEN
|
||||
|
@ -3373,7 +3448,7 @@ FE63 ; Pd # SMALL HYPHEN-MINUS
|
|||
FF0D ; Pd # FULLWIDTH HYPHEN-MINUS
|
||||
10EAD ; Pd # YEZIDI HYPHENATION MARK
|
||||
|
||||
# Total code points: 25
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3425,6 +3500,10 @@ FF0D ; Pd # FULLWIDTH HYPHEN-MINUS
|
|||
2E26 ; Ps # LEFT SIDEWAYS U BRACKET
|
||||
2E28 ; Ps # LEFT DOUBLE PARENTHESIS
|
||||
2E42 ; Ps # DOUBLE LOW-REVERSED-9 QUOTATION MARK
|
||||
2E55 ; Ps # LEFT SQUARE BRACKET WITH STROKE
|
||||
2E57 ; Ps # LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59 ; Ps # TOP HALF LEFT PARENTHESIS
|
||||
2E5B ; Ps # BOTTOM HALF LEFT PARENTHESIS
|
||||
3008 ; Ps # LEFT ANGLE BRACKET
|
||||
300A ; Ps # LEFT DOUBLE ANGLE BRACKET
|
||||
300C ; Ps # LEFT CORNER BRACKET
|
||||
|
@ -3455,7 +3534,7 @@ FF5B ; Ps # FULLWIDTH LEFT CURLY BRACKET
|
|||
FF5F ; Ps # FULLWIDTH LEFT WHITE PARENTHESIS
|
||||
FF62 ; Ps # HALFWIDTH LEFT CORNER BRACKET
|
||||
|
||||
# Total code points: 75
|
||||
# Total code points: 79
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3504,6 +3583,10 @@ FF62 ; Ps # HALFWIDTH LEFT CORNER BRACKET
|
|||
2E25 ; Pe # BOTTOM RIGHT HALF BRACKET
|
||||
2E27 ; Pe # RIGHT SIDEWAYS U BRACKET
|
||||
2E29 ; Pe # RIGHT DOUBLE PARENTHESIS
|
||||
2E56 ; Pe # RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E58 ; Pe # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E5A ; Pe # TOP HALF RIGHT PARENTHESIS
|
||||
2E5C ; Pe # BOTTOM HALF RIGHT PARENTHESIS
|
||||
3009 ; Pe # RIGHT ANGLE BRACKET
|
||||
300B ; Pe # RIGHT DOUBLE ANGLE BRACKET
|
||||
300D ; Pe # RIGHT CORNER BRACKET
|
||||
|
@ -3534,7 +3617,7 @@ FF5D ; Pe # FULLWIDTH RIGHT CURLY BRACKET
|
|||
FF60 ; Pe # FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
FF63 ; Pe # HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
||||
# Total code points: 73
|
||||
# Total code points: 77
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3576,7 +3659,7 @@ FF3F ; Pc # FULLWIDTH LOW LINE
|
|||
0609..060A ; Po # [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN
|
||||
060C..060D ; Po # [2] ARABIC COMMA..ARABIC DATE SEPARATOR
|
||||
061B ; Po # ARABIC SEMICOLON
|
||||
061E..061F ; Po # [2] ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC QUESTION MARK
|
||||
061D..061F ; Po # [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK
|
||||
066A..066D ; Po # [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR
|
||||
06D4 ; Po # ARABIC FULL STOP
|
||||
0700..070D ; Po # [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS
|
||||
|
@ -3613,6 +3696,7 @@ FF3F ; Pc # FULLWIDTH LOW LINE
|
|||
1AA0..1AA6 ; Po # [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA
|
||||
1AA8..1AAD ; Po # [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG
|
||||
1B5A..1B60 ; Po # [7] BALINESE PANTI..BALINESE PAMENENG
|
||||
1B7D..1B7E ; Po # [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG
|
||||
1BFC..1BFF ; Po # [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
|
||||
1C3B..1C3F ; Po # [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK
|
||||
1C7E..1C7F ; Po # [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
|
||||
|
@ -3641,7 +3725,7 @@ FF3F ; Pc # FULLWIDTH LOW LINE
|
|||
2E3C..2E3F ; Po # [4] STENOGRAPHIC FULL STOP..CAPITULUM
|
||||
2E41 ; Po # REVERSED COMMA
|
||||
2E43..2E4F ; Po # [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER
|
||||
2E52 ; Po # TIRONIAN SIGN CAPITAL ET
|
||||
2E52..2E54 ; Po # [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK
|
||||
3001..3003 ; Po # [3] IDEOGRAPHIC COMMA..DITTO MARK
|
||||
303D ; Po # PART ALTERNATION MARK
|
||||
30FB ; Po # KATAKANA MIDDLE DOT
|
||||
|
@ -3695,6 +3779,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
10B39..10B3F ; Po # [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION
|
||||
10B99..10B9C ; Po # [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
|
||||
10F55..10F59 ; Po # [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
|
||||
10F86..10F89 ; Po # [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS
|
||||
11047..1104D ; Po # [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
|
||||
110BB..110BC ; Po # [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN
|
||||
110BE..110C1 ; Po # [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
|
||||
|
@ -3713,6 +3798,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
115C1..115D7 ; Po # [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
|
||||
11641..11643 ; Po # [3] MODI DANDA..MODI ABBREVIATION SIGN
|
||||
11660..1166C ; Po # [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
|
||||
116B9 ; Po # TAKRI ABBREVIATION SIGN
|
||||
1173C..1173E ; Po # [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
|
||||
1183B ; Po # DOGRA ABBREVIATION SIGN
|
||||
11944..11946 ; Po # [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK
|
||||
|
@ -3725,6 +3811,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
11EF7..11EF8 ; Po # [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
|
||||
11FFF ; Po # TAMIL PUNCTUATION END OF TEXT
|
||||
12470..12474 ; Po # [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
|
||||
12FF1..12FF2 ; Po # [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
|
||||
16A6E..16A6F ; Po # [2] MRO DANDA..MRO DOUBLE DANDA
|
||||
16AF5 ; Po # BASSA VAH FULL STOP
|
||||
16B37..16B3B ; Po # [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
|
||||
|
@ -3735,7 +3822,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL
|
|||
1DA87..1DA8B ; Po # [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS
|
||||
1E95E..1E95F ; Po # [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK
|
||||
|
||||
# Total code points: 593
|
||||
# Total code points: 605
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3823,7 +3910,7 @@ FFE9..FFEC ; Sm # [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW
|
|||
0BF9 ; Sc # TAMIL RUPEE SIGN
|
||||
0E3F ; Sc # THAI CURRENCY SYMBOL BAHT
|
||||
17DB ; Sc # KHMER CURRENCY SYMBOL RIEL
|
||||
20A0..20BF ; Sc # [32] EURO-CURRENCY SIGN..BITCOIN SIGN
|
||||
20A0..20C0 ; Sc # [33] EURO-CURRENCY SIGN..SOM SIGN
|
||||
A838 ; Sc # NORTH INDIC RUPEE MARK
|
||||
FDFC ; Sc # RIAL SIGN
|
||||
FE69 ; Sc # SMALL DOLLAR SIGN
|
||||
|
@ -3834,7 +3921,7 @@ FFE5..FFE6 ; Sc # [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
|
|||
1E2FF ; Sc # WANCHO NGUN SIGN
|
||||
1ECB0 ; Sc # INDIC SIYAQ RUPEE MARK
|
||||
|
||||
# Total code points: 62
|
||||
# Total code points: 63
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3853,6 +3940,7 @@ FFE5..FFE6 ; Sc # [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
|
|||
02EF..02FF ; Sk # [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW
|
||||
0375 ; Sk # GREEK LOWER NUMERAL SIGN
|
||||
0384..0385 ; Sk # [2] GREEK TONOS..GREEK DIALYTIKA TONOS
|
||||
0888 ; Sk # ARABIC RAISED ROUND DOT
|
||||
1FBD ; Sk # GREEK KORONIS
|
||||
1FBF..1FC1 ; Sk # [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
|
||||
1FCD..1FCF ; Sk # [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI
|
||||
|
@ -3865,13 +3953,13 @@ A720..A721 ; Sk # [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER
|
|||
A789..A78A ; Sk # [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
|
||||
AB5B ; Sk # MODIFIER BREVE WITH INVERTED BREVE
|
||||
AB6A..AB6B ; Sk # [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK
|
||||
FBB2..FBC1 ; Sk # [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
|
||||
FBB2..FBC2 ; Sk # [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE
|
||||
FF3E ; Sk # FULLWIDTH CIRCUMFLEX ACCENT
|
||||
FF40 ; Sk # FULLWIDTH GRAVE ACCENT
|
||||
FFE3 ; Sk # FULLWIDTH MACRON
|
||||
1F3FB..1F3FF ; Sk # [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
|
||||
|
||||
# Total code points: 123
|
||||
# Total code points: 125
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -3984,7 +4072,9 @@ A828..A82B ; So # [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-
|
|||
A836..A837 ; So # [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
|
||||
A839 ; So # NORTH INDIC QUANTITY MARK
|
||||
AA77..AA79 ; So # [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
|
||||
FDFD ; So # ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
|
||||
FD40..FD4F ; So # [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH
|
||||
FDCF ; So # ARABIC LIGATURE SALAAMUHU ALAYNAA
|
||||
FDFD..FDFF ; So # [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL
|
||||
FFE4 ; So # FULLWIDTH BROKEN BAR
|
||||
FFE8 ; So # HALFWIDTH FORMS LIGHT VERTICAL
|
||||
FFED..FFEE ; So # [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
|
||||
|
@ -4003,13 +4093,14 @@ FFFC..FFFD ; So # [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
|
|||
16B3C..16B3F ; So # [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
|
||||
16B45 ; So # PAHAWH HMONG SIGN CIM TSOV ROG
|
||||
1BC9C ; So # DUPLOYAN SIGN O WITH CROSS
|
||||
1CF50..1CFC3 ; So # [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK
|
||||
1D000..1D0F5 ; So # [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
|
||||
1D100..1D126 ; So # [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
|
||||
1D129..1D164 ; So # [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
|
||||
1D16A..1D16C ; So # [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3
|
||||
1D183..1D184 ; So # [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN
|
||||
1D18C..1D1A9 ; So # [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH
|
||||
1D1AE..1D1E8 ; So # [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN
|
||||
1D1AE..1D1EA ; So # [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
|
||||
1D200..1D241 ; So # [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
|
||||
1D245 ; So # GREEK MUSICAL LEIMMA
|
||||
1D300..1D356 ; So # [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
|
||||
|
@ -4035,32 +4126,33 @@ FFFC..FFFD ; So # [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
|
|||
1F260..1F265 ; So # [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
|
||||
1F300..1F3FA ; So # [251] CYCLONE..AMPHORA
|
||||
1F400..1F6D7 ; So # [728] RAT..ELEVATOR
|
||||
1F6E0..1F6EC ; So # [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
|
||||
1F6DD..1F6EC ; So # [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
|
||||
1F6F0..1F6FC ; So # [13] SATELLITE..ROLLER SKATE
|
||||
1F700..1F773 ; So # [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
|
||||
1F780..1F7D8 ; So # [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
|
||||
1F7E0..1F7EB ; So # [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
|
||||
1F7F0 ; So # HEAVY EQUALS SIGN
|
||||
1F800..1F80B ; So # [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
|
||||
1F810..1F847 ; So # [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
|
||||
1F850..1F859 ; So # [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
|
||||
1F860..1F887 ; So # [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
|
||||
1F890..1F8AD ; So # [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
|
||||
1F8B0..1F8B1 ; So # [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
|
||||
1F900..1F978 ; So # [121] CIRCLED CROSS FORMEE WITH FOUR DOTS..DISGUISED FACE
|
||||
1F97A..1F9CB ; So # [82] FACE WITH PLEADING EYES..BUBBLE TEA
|
||||
1F9CD..1FA53 ; So # [135] STANDING PERSON..BLACK CHESS KNIGHT-BISHOP
|
||||
1F900..1FA53 ; So # [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP
|
||||
1FA60..1FA6D ; So # [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
|
||||
1FA70..1FA74 ; So # [5] BALLET SHOES..THONG SANDAL
|
||||
1FA78..1FA7A ; So # [3] DROP OF BLOOD..STETHOSCOPE
|
||||
1FA78..1FA7C ; So # [5] DROP OF BLOOD..CRUTCH
|
||||
1FA80..1FA86 ; So # [7] YO-YO..NESTING DOLLS
|
||||
1FA90..1FAA8 ; So # [25] RINGED PLANET..ROCK
|
||||
1FAB0..1FAB6 ; So # [7] FLY..FEATHER
|
||||
1FAC0..1FAC2 ; So # [3] ANATOMICAL HEART..PEOPLE HUGGING
|
||||
1FAD0..1FAD6 ; So # [7] BLUEBERRIES..TEAPOT
|
||||
1FA90..1FAAC ; So # [29] RINGED PLANET..HAMSA
|
||||
1FAB0..1FABA ; So # [11] FLY..NEST WITH EGGS
|
||||
1FAC0..1FAC5 ; So # [6] ANATOMICAL HEART..PERSON WITH CROWN
|
||||
1FAD0..1FAD9 ; So # [10] BLUEBERRIES..JAR
|
||||
1FAE0..1FAE7 ; So # [8] MELTING FACE..BUBBLES
|
||||
1FAF0..1FAF6 ; So # [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
|
||||
1FB00..1FB92 ; So # [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
|
||||
1FB94..1FBCA ; So # [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
|
||||
|
||||
# Total code points: 6431
|
||||
# Total code points: 6605
|
||||
|
||||
# ================================================
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# GraphemeBreakProperty-13.0.0.txt
|
||||
# Date: 2019-10-21, 14:30:35 GMT
|
||||
# © 2019 Unicode®, Inc.
|
||||
# GraphemeBreakProperty-14.0.0.txt
|
||||
# Date: 2021-08-12, 23:13:02 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -21,6 +21,7 @@
|
|||
0600..0605 ; Prepend # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
|
||||
06DD ; Prepend # Cf ARABIC END OF AYAH
|
||||
070F ; Prepend # Cf SYRIAC ABBREVIATION MARK
|
||||
0890..0891 ; Prepend # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
|
||||
08E2 ; Prepend # Cf ARABIC DISPUTED END OF AYAH
|
||||
0D4E ; Prepend # Lo MALAYALAM LETTER DOT REPH
|
||||
110BD ; Prepend # Cf KAITHI NUMBER SIGN
|
||||
|
@ -32,7 +33,7 @@
|
|||
11A84..11A89 ; Prepend # Lo [6] SOYOMBO SIGN JIHVAMULIYA..SOYOMBO CLUSTER-INITIAL LETTER SA
|
||||
11D46 ; Prepend # Lo MASARAM GONDI REPHA
|
||||
|
||||
# Total code points: 24
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -104,7 +105,8 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
|
||||
0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
|
||||
0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
|
||||
08D3..08E1 ; Extend # Mn [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
|
||||
0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
|
||||
08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
|
||||
08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
|
||||
093A ; Extend # Mn DEVANAGARI VOWEL SIGN OE
|
||||
093C ; Extend # Mn DEVANAGARI SIGN NUKTA
|
||||
|
@ -151,6 +153,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
0BD7 ; Extend # Mc TAMIL AU LENGTH MARK
|
||||
0C00 ; Extend # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
|
||||
0C04 ; Extend # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE
|
||||
0C3C ; Extend # Mn TELUGU SIGN NUKTA
|
||||
0C3E..0C40 ; Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
|
||||
0C46..0C48 ; Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
|
||||
0C4A..0C4D ; Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
|
||||
|
@ -206,7 +209,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
109D ; Extend # Mn MYANMAR VOWEL SIGN AITON AI
|
||||
135D..135F ; Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
|
||||
1712..1714 ; Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
|
||||
1732..1734 ; Extend # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
|
||||
1732..1733 ; Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
|
||||
1752..1753 ; Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
|
||||
1772..1773 ; Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
|
||||
17B4..17B5 ; Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
|
||||
|
@ -215,6 +218,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
17C9..17D3 ; Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
|
||||
17DD ; Extend # Mn KHMER SIGN ATTHACAN
|
||||
180B..180D ; Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||
180F ; Extend # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR
|
||||
1885..1886 ; Extend # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
|
||||
18A9 ; Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA
|
||||
1920..1922 ; Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
|
||||
|
@ -232,7 +236,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
1A7F ; Extend # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
|
||||
1AB0..1ABD ; Extend # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
|
||||
1ABE ; Extend # Me COMBINING PARENTHESES OVERLAY
|
||||
1ABF..1AC0 ; Extend # Mn [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
|
||||
1ABF..1ACE ; Extend # Mn [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
|
||||
1B00..1B03 ; Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
|
||||
1B34 ; Extend # Mn BALINESE SIGN REREKAN
|
||||
1B35 ; Extend # Mc BALINESE VOWEL SIGN TEDUNG
|
||||
|
@ -256,8 +260,7 @@ E01F0..E0FFF ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
|
|||
1CED ; Extend # Mn VEDIC SIGN TIRYAK
|
||||
1CF4 ; Extend # Mn VEDIC TONE CANDRA ABOVE
|
||||
1CF8..1CF9 ; Extend # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
1DC0..1DF9 ; Extend # Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
|
||||
1DFB..1DFF ; Extend # Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
1DC0..1DFF ; Extend # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
200C ; Extend # Cf ZERO WIDTH NON-JOINER
|
||||
20D0..20DC ; Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
|
||||
20DD..20E0 ; Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
|
||||
|
@ -322,11 +325,15 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
|
||||
10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
|
||||
10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
|
||||
10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
|
||||
11001 ; Extend # Mn BRAHMI SIGN ANUSVARA
|
||||
11038..11046 ; Extend # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
|
||||
11070 ; Extend # Mn BRAHMI SIGN OLD TAMIL VIRAMA
|
||||
11073..11074 ; Extend # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
|
||||
1107F..11081 ; Extend # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
|
||||
110B3..110B6 ; Extend # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
|
||||
110B9..110BA ; Extend # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
|
||||
110C2 ; Extend # Mn KAITHI VOWEL SIGN VOCALIC R
|
||||
11100..11102 ; Extend # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
|
||||
11127..1112B ; Extend # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
|
||||
1112D..11134 ; Extend # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
|
||||
|
@ -412,6 +419,8 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
16F8F..16F92 ; Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
|
||||
16FE4 ; Extend # Mn KHITAN SMALL SCRIPT FILLER
|
||||
1BC9D..1BC9E ; Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
|
||||
1CF00..1CF2D ; Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
|
||||
1CF30..1CF46 ; Extend # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
|
||||
1D165 ; Extend # Mc MUSICAL SYMBOL COMBINING STEM
|
||||
1D167..1D169 ; Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
|
||||
1D16E..1D172 ; Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
|
||||
|
@ -431,6 +440,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
|
||||
1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
|
||||
1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
|
||||
1E2AE ; Extend # Mn TOTO SIGN RISING TONE
|
||||
1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
|
||||
1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
|
||||
1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
|
||||
|
@ -438,7 +448,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
|
|||
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
|
||||
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
||||
|
||||
# Total code points: 1984
|
||||
# Total code points: 2095
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -495,6 +505,8 @@ E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
|||
103B..103C ; SpacingMark # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA
|
||||
1056..1057 ; SpacingMark # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR
|
||||
1084 ; SpacingMark # Mc MYANMAR VOWEL SIGN SHAN E
|
||||
1715 ; SpacingMark # Mc TAGALOG SIGN PAMUDPOD
|
||||
1734 ; SpacingMark # Mc HANUNOO SIGN PAMUDPOD
|
||||
17B6 ; SpacingMark # Mc KHMER VOWEL SIGN AA
|
||||
17BE..17C5 ; SpacingMark # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
|
||||
17C7..17C8 ; SpacingMark # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
|
||||
|
@ -579,7 +591,6 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK
|
|||
116AC ; SpacingMark # Mc TAKRI SIGN VISARGA
|
||||
116AE..116AF ; SpacingMark # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
|
||||
116B6 ; SpacingMark # Mc TAKRI SIGN VIRAMA
|
||||
11720..11721 ; SpacingMark # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA
|
||||
11726 ; SpacingMark # Mc AHOM VOWEL SIGN E
|
||||
1182C..1182E ; SpacingMark # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II
|
||||
11838 ; SpacingMark # Mc DOGRA SIGN VISARGA
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,212 @@
|
|||
# PropertyAliases-14.0.0.txt
|
||||
# Date: 2021-03-08, 19:35:48 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# This file contains aliases for properties used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line has two or more fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field is the short name for the property.
|
||||
# It is typically an abbreviation, but in a number of cases it is simply
|
||||
# a duplicate of the "long name" in the second field.
|
||||
# For Unihan database tags, the short name is actually a longer string than
|
||||
# the tag specified in the second field.
|
||||
#
|
||||
# Second Field: The second field is the long name for the property,
|
||||
# typically the formal name used in documentation about the property.
|
||||
#
|
||||
# The above are the preferred aliases. Other aliases may be listed in additional fields.
|
||||
#
|
||||
# Loose matching should be applied to all property names and property values, with
|
||||
# the exception of String Property values. With loose matching of property names and
|
||||
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
|
||||
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
|
||||
#
|
||||
# NOTE: Property value names are NOT unique across properties. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Above_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# sc means the Script property, and
|
||||
# Sc means the General_Category property value Currency_Symbol (Sc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
#
|
||||
# For more information, see UAX #44, Unicode Character Database, and
|
||||
# UTS #18, Unicode Regular Expressions.
|
||||
# ================================================
|
||||
|
||||
|
||||
# ================================================
|
||||
# Numeric Properties
|
||||
# ================================================
|
||||
cjkAccountingNumeric ; kAccountingNumeric
|
||||
cjkOtherNumeric ; kOtherNumeric
|
||||
cjkPrimaryNumeric ; kPrimaryNumeric
|
||||
nv ; Numeric_Value
|
||||
|
||||
# ================================================
|
||||
# String Properties
|
||||
# ================================================
|
||||
cf ; Case_Folding
|
||||
cjkCompatibilityVariant ; kCompatibilityVariant
|
||||
dm ; Decomposition_Mapping
|
||||
FC_NFKC ; FC_NFKC_Closure
|
||||
lc ; Lowercase_Mapping
|
||||
NFKC_CF ; NFKC_Casefold
|
||||
scf ; Simple_Case_Folding ; sfc
|
||||
slc ; Simple_Lowercase_Mapping
|
||||
stc ; Simple_Titlecase_Mapping
|
||||
suc ; Simple_Uppercase_Mapping
|
||||
tc ; Titlecase_Mapping
|
||||
uc ; Uppercase_Mapping
|
||||
|
||||
# ================================================
|
||||
# Miscellaneous Properties
|
||||
# ================================================
|
||||
bmg ; Bidi_Mirroring_Glyph
|
||||
bpb ; Bidi_Paired_Bracket
|
||||
cjkIICore ; kIICore
|
||||
cjkIRG_GSource ; kIRG_GSource
|
||||
cjkIRG_HSource ; kIRG_HSource
|
||||
cjkIRG_JSource ; kIRG_JSource
|
||||
cjkIRG_KPSource ; kIRG_KPSource
|
||||
cjkIRG_KSource ; kIRG_KSource
|
||||
cjkIRG_MSource ; kIRG_MSource
|
||||
cjkIRG_SSource ; kIRG_SSource
|
||||
cjkIRG_TSource ; kIRG_TSource
|
||||
cjkIRG_UKSource ; kIRG_UKSource
|
||||
cjkIRG_USource ; kIRG_USource
|
||||
cjkIRG_VSource ; kIRG_VSource
|
||||
cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS
|
||||
EqUIdeo ; Equivalent_Unified_Ideograph
|
||||
isc ; ISO_Comment
|
||||
JSN ; Jamo_Short_Name
|
||||
na ; Name
|
||||
na1 ; Unicode_1_Name
|
||||
Name_Alias ; Name_Alias
|
||||
scx ; Script_Extensions
|
||||
|
||||
# ================================================
|
||||
# Catalog Properties
|
||||
# ================================================
|
||||
age ; Age
|
||||
blk ; Block
|
||||
sc ; Script
|
||||
|
||||
# ================================================
|
||||
# Enumerated Properties
|
||||
# ================================================
|
||||
bc ; Bidi_Class
|
||||
bpt ; Bidi_Paired_Bracket_Type
|
||||
ccc ; Canonical_Combining_Class
|
||||
dt ; Decomposition_Type
|
||||
ea ; East_Asian_Width
|
||||
gc ; General_Category
|
||||
GCB ; Grapheme_Cluster_Break
|
||||
hst ; Hangul_Syllable_Type
|
||||
InPC ; Indic_Positional_Category
|
||||
InSC ; Indic_Syllabic_Category
|
||||
jg ; Joining_Group
|
||||
jt ; Joining_Type
|
||||
lb ; Line_Break
|
||||
NFC_QC ; NFC_Quick_Check
|
||||
NFD_QC ; NFD_Quick_Check
|
||||
NFKC_QC ; NFKC_Quick_Check
|
||||
NFKD_QC ; NFKD_Quick_Check
|
||||
nt ; Numeric_Type
|
||||
SB ; Sentence_Break
|
||||
vo ; Vertical_Orientation
|
||||
WB ; Word_Break
|
||||
|
||||
# ================================================
|
||||
# Binary Properties
|
||||
# ================================================
|
||||
AHex ; ASCII_Hex_Digit
|
||||
Alpha ; Alphabetic
|
||||
Bidi_C ; Bidi_Control
|
||||
Bidi_M ; Bidi_Mirrored
|
||||
Cased ; Cased
|
||||
CE ; Composition_Exclusion
|
||||
CI ; Case_Ignorable
|
||||
Comp_Ex ; Full_Composition_Exclusion
|
||||
CWCF ; Changes_When_Casefolded
|
||||
CWCM ; Changes_When_Casemapped
|
||||
CWKCF ; Changes_When_NFKC_Casefolded
|
||||
CWL ; Changes_When_Lowercased
|
||||
CWT ; Changes_When_Titlecased
|
||||
CWU ; Changes_When_Uppercased
|
||||
Dash ; Dash
|
||||
Dep ; Deprecated
|
||||
DI ; Default_Ignorable_Code_Point
|
||||
Dia ; Diacritic
|
||||
EBase ; Emoji_Modifier_Base
|
||||
EComp ; Emoji_Component
|
||||
EMod ; Emoji_Modifier
|
||||
Emoji ; Emoji
|
||||
EPres ; Emoji_Presentation
|
||||
Ext ; Extender
|
||||
ExtPict ; Extended_Pictographic
|
||||
Gr_Base ; Grapheme_Base
|
||||
Gr_Ext ; Grapheme_Extend
|
||||
Gr_Link ; Grapheme_Link
|
||||
Hex ; Hex_Digit
|
||||
Hyphen ; Hyphen
|
||||
IDC ; ID_Continue
|
||||
Ideo ; Ideographic
|
||||
IDS ; ID_Start
|
||||
IDSB ; IDS_Binary_Operator
|
||||
IDST ; IDS_Trinary_Operator
|
||||
Join_C ; Join_Control
|
||||
LOE ; Logical_Order_Exception
|
||||
Lower ; Lowercase
|
||||
Math ; Math
|
||||
NChar ; Noncharacter_Code_Point
|
||||
OAlpha ; Other_Alphabetic
|
||||
ODI ; Other_Default_Ignorable_Code_Point
|
||||
OGr_Ext ; Other_Grapheme_Extend
|
||||
OIDC ; Other_ID_Continue
|
||||
OIDS ; Other_ID_Start
|
||||
OLower ; Other_Lowercase
|
||||
OMath ; Other_Math
|
||||
OUpper ; Other_Uppercase
|
||||
Pat_Syn ; Pattern_Syntax
|
||||
Pat_WS ; Pattern_White_Space
|
||||
PCM ; Prepended_Concatenation_Mark
|
||||
QMark ; Quotation_Mark
|
||||
Radical ; Radical
|
||||
RI ; Regional_Indicator
|
||||
SD ; Soft_Dotted
|
||||
STerm ; Sentence_Terminal
|
||||
Term ; Terminal_Punctuation
|
||||
UIdeo ; Unified_Ideograph
|
||||
Upper ; Uppercase
|
||||
VS ; Variation_Selector
|
||||
WSpace ; White_Space ; space
|
||||
XIDC ; XID_Continue
|
||||
XIDS ; XID_Start
|
||||
XO_NFC ; Expands_On_NFC
|
||||
XO_NFD ; Expands_On_NFD
|
||||
XO_NFKC ; Expands_On_NFKC
|
||||
XO_NFKD ; Expands_On_NFKD
|
||||
|
||||
# ================================================
|
||||
# Total: 129
|
||||
|
||||
# EOF
|
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,6 @@
|
|||
# ScriptExtensions-13.0.0.txt
|
||||
# Date: 2020-01-22, 00:07:43 GMT
|
||||
# © 2020 Unicode®, Inc.
|
||||
# ScriptExtensions-14.0.0.txt
|
||||
# Date: 2021-06-04, 02:19:38 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
|
@ -11,10 +11,10 @@
|
|||
# with more than one script, but with a limited number of scripts.
|
||||
# For each code point, there is one or more property values. Each such value is a Script property value.
|
||||
# For more information, see:
|
||||
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
|
||||
# UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
|
||||
# Especially the sections:
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
# Each Script_Extensions value in this file consists of a set
|
||||
# of one or more abbreviated Script property values. The ordering of the
|
||||
|
@ -119,6 +119,14 @@
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Syrc
|
||||
|
||||
1DFA ; Syrc # Mn COMBINING DOT BELOW LEFT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Copt
|
||||
|
||||
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
|
||||
|
@ -136,6 +144,15 @@
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Nkoo
|
||||
|
||||
FD3E ; Arab Nkoo # Pe ORNATE LEFT PARENTHESIS
|
||||
FD3F ; Arab Nkoo # Ps ORNATE RIGHT PARENTHESIS
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Syrc
|
||||
|
||||
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
|
||||
|
@ -186,10 +203,10 @@ A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
|
|||
|
||||
# Script_Extensions=Cprt Linb
|
||||
|
||||
10100..10102 ; Cprt Linb # Po [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
|
||||
10102 ; Cprt Linb # Po AEGEAN CHECK MARK
|
||||
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
|
||||
|
||||
# Total code points: 12
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -342,6 +359,14 @@ FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFW
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mani Ougr
|
||||
|
||||
10AF2 ; Mani Ougr # Po MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mong Phag
|
||||
|
||||
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
|
||||
|
@ -383,6 +408,14 @@ FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFW
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cpmn Cprt Linb
|
||||
|
||||
10100..10101 ; Cpmn Cprt Linb # Po [2] AEGEAN WORD SEPARATOR LINE..AEGEAN WORD SEPARATOR DOT
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cprt Lina Linb
|
||||
|
||||
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
|
||||
|
@ -449,16 +482,6 @@ A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Rohg Syrc Thaa Yezi
|
||||
|
||||
060C ; Arab Rohg Syrc Thaa Yezi # Po ARABIC COMMA
|
||||
061B ; Arab Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
|
||||
061F ; Arab Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana
|
||||
|
||||
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
|
||||
|
@ -474,6 +497,15 @@ FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Nkoo Rohg Syrc Thaa Yezi
|
||||
|
||||
060C ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC COMMA
|
||||
061B ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
|
||||
|
||||
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
|
||||
|
@ -513,9 +545,9 @@ FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC C
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
|
||||
# Script_Extensions=Adlm Arab Nkoo Rohg Syrc Thaa Yezi
|
||||
|
||||
0640 ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
|
||||
061F ; Adlm Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
|
@ -529,6 +561,14 @@ FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC C
|
|||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc
|
||||
|
||||
0640 ; Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
|
||||
|
||||
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
# Scripts-13.0.0.txt
|
||||
# Date: 2020-01-22, 00:07:43 GMT
|
||||
# © 2020 Unicode®, Inc.
|
||||
# Scripts-14.0.0.txt
|
||||
# Date: 2021-07-10, 00:35:31 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
# For more information, see:
|
||||
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
|
||||
# UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
|
||||
# Especially the sections:
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
|
||||
# ================================================
|
||||
|
@ -154,7 +154,7 @@
|
|||
208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
|
||||
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
|
||||
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
|
||||
20A0..20BF ; Common # Sc [32] EURO-CURRENCY SIGN..BITCOIN SIGN
|
||||
20A0..20C0 ; Common # Sc [33] EURO-CURRENCY SIGN..SOM SIGN
|
||||
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
|
||||
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
|
||||
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
|
||||
|
@ -347,7 +347,16 @@
|
|||
2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
|
||||
2E43..2E4F ; Common # Po [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER
|
||||
2E50..2E51 ; Common # So [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR
|
||||
2E52 ; Common # Po TIRONIAN SIGN CAPITAL ET
|
||||
2E52..2E54 ; Common # Po [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK
|
||||
2E55 ; Common # Ps LEFT SQUARE BRACKET WITH STROKE
|
||||
2E56 ; Common # Pe RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E57 ; Common # Ps LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E58 ; Common # Pe RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59 ; Common # Ps TOP HALF LEFT PARENTHESIS
|
||||
2E5A ; Common # Pe TOP HALF RIGHT PARENTHESIS
|
||||
2E5B ; Common # Ps BOTTOM HALF LEFT PARENTHESIS
|
||||
2E5C ; Common # Pe BOTTOM HALF RIGHT PARENTHESIS
|
||||
2E5D ; Common # Pd OBLIQUE HYPHEN
|
||||
2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
|
||||
3000 ; Common # Zs IDEOGRAPHIC SPACE
|
||||
3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
|
||||
|
@ -511,9 +520,8 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
|
|||
10190..1019C ; Common # So [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL
|
||||
101D0..101FC ; Common # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND
|
||||
102E1..102FB ; Common # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
|
||||
16FE2 ; Common # Po OLD CHINESE HOOK MARK
|
||||
16FE3 ; Common # Lm OLD CHINESE ITERATION MARK
|
||||
1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
|
||||
1CF50..1CFC3 ; Common # So [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK
|
||||
1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
|
||||
1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
|
||||
1D129..1D164 ; Common # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
|
||||
|
@ -523,7 +531,7 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
|
|||
1D173..1D17A ; Common # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
|
||||
1D183..1D184 ; Common # So [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN
|
||||
1D18C..1D1A9 ; Common # So [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH
|
||||
1D1AE..1D1E8 ; Common # So [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN
|
||||
1D1AE..1D1EA ; Common # So [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
|
||||
1D2E0..1D2F3 ; Common # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
|
||||
1D300..1D356 ; Common # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
|
||||
1D360..1D378 ; Common # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
|
||||
|
@ -593,35 +601,36 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
|
|||
1F300..1F3FA ; Common # So [251] CYCLONE..AMPHORA
|
||||
1F3FB..1F3FF ; Common # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
|
||||
1F400..1F6D7 ; Common # So [728] RAT..ELEVATOR
|
||||
1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
|
||||
1F6DD..1F6EC ; Common # So [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
|
||||
1F6F0..1F6FC ; Common # So [13] SATELLITE..ROLLER SKATE
|
||||
1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
|
||||
1F780..1F7D8 ; Common # So [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
|
||||
1F7E0..1F7EB ; Common # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
|
||||
1F7F0 ; Common # So HEAVY EQUALS SIGN
|
||||
1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
|
||||
1F810..1F847 ; Common # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
|
||||
1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
|
||||
1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
|
||||
1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
|
||||
1F8B0..1F8B1 ; Common # So [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
|
||||
1F900..1F978 ; Common # So [121] CIRCLED CROSS FORMEE WITH FOUR DOTS..DISGUISED FACE
|
||||
1F97A..1F9CB ; Common # So [82] FACE WITH PLEADING EYES..BUBBLE TEA
|
||||
1F9CD..1FA53 ; Common # So [135] STANDING PERSON..BLACK CHESS KNIGHT-BISHOP
|
||||
1F900..1FA53 ; Common # So [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP
|
||||
1FA60..1FA6D ; Common # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
|
||||
1FA70..1FA74 ; Common # So [5] BALLET SHOES..THONG SANDAL
|
||||
1FA78..1FA7A ; Common # So [3] DROP OF BLOOD..STETHOSCOPE
|
||||
1FA78..1FA7C ; Common # So [5] DROP OF BLOOD..CRUTCH
|
||||
1FA80..1FA86 ; Common # So [7] YO-YO..NESTING DOLLS
|
||||
1FA90..1FAA8 ; Common # So [25] RINGED PLANET..ROCK
|
||||
1FAB0..1FAB6 ; Common # So [7] FLY..FEATHER
|
||||
1FAC0..1FAC2 ; Common # So [3] ANATOMICAL HEART..PEOPLE HUGGING
|
||||
1FAD0..1FAD6 ; Common # So [7] BLUEBERRIES..TEAPOT
|
||||
1FA90..1FAAC ; Common # So [29] RINGED PLANET..HAMSA
|
||||
1FAB0..1FABA ; Common # So [11] FLY..NEST WITH EGGS
|
||||
1FAC0..1FAC5 ; Common # So [6] ANATOMICAL HEART..PERSON WITH CROWN
|
||||
1FAD0..1FAD9 ; Common # So [10] BLUEBERRIES..JAR
|
||||
1FAE0..1FAE7 ; Common # So [8] MELTING FACE..BUBBLES
|
||||
1FAF0..1FAF6 ; Common # So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
|
||||
1FB00..1FB92 ; Common # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
|
||||
1FB94..1FBCA ; Common # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
|
||||
1FBF0..1FBF9 ; Common # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
|
||||
E0001 ; Common # Cf LANGUAGE TAG
|
||||
E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG
|
||||
|
||||
# Total code points: 8087
|
||||
# Total code points: 8252
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -664,8 +673,11 @@ A770 ; Latin # Lm MODIFIER LETTER US
|
|||
A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
|
||||
A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
|
||||
A78F ; Latin # Lo LATIN LETTER SINOLOGICAL DOT
|
||||
A790..A7BF ; Latin # L& [48] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER GLOTTAL U
|
||||
A7C2..A7CA ; Latin # L& [9] LATIN CAPITAL LETTER ANGLICANA W..LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A790..A7CA ; Latin # L& [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
|
||||
A7D0..A7D1 ; Latin # L& [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G
|
||||
A7D3 ; Latin # L& LATIN SMALL LETTER DOUBLE THORN
|
||||
A7D5..A7D9 ; Latin # L& [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S
|
||||
A7F2..A7F4 ; Latin # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
|
||||
A7F5..A7F6 ; Latin # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
|
||||
A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
|
||||
A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
|
||||
|
@ -679,8 +691,14 @@ AB69 ; Latin # Lm MODIFIER LETTER SMALL TURNED W
|
|||
FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
|
||||
FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
|
||||
10780..10785 ; Latin # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
|
||||
10787..107B0 ; Latin # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
|
||||
107B2..107BA ; Latin # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
|
||||
1DF00..1DF09 ; Latin # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
|
||||
1DF0A ; Latin # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
|
||||
1DF0B..1DF1E ; Latin # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
|
||||
|
||||
# Total code points: 1374
|
||||
# Total code points: 1475
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -820,7 +838,7 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU
|
|||
060E..060F ; Arabic # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA
|
||||
0610..061A ; Arabic # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
|
||||
061C ; Arabic # Cf ARABIC LETTER MARK
|
||||
061E ; Arabic # Po ARABIC TRIPLE DOT PUNCTUATION MARK
|
||||
061D..061E ; Arabic # Po [2] ARABIC END OF TEXT MARK..ARABIC TRIPLE DOT PUNCTUATION MARK
|
||||
0620..063F ; Arabic # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
|
||||
0641..064A ; Arabic # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH
|
||||
0656..065F ; Arabic # Mn [10] ARABIC SUBSCRIPT ALEF..ARABIC WAVY HAMZA BELOW
|
||||
|
@ -843,18 +861,25 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU
|
|||
06FD..06FE ; Arabic # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN
|
||||
06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V
|
||||
0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
|
||||
08A0..08B4 ; Arabic # Lo [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW
|
||||
08B6..08C7 ; Arabic # Lo [18] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
|
||||
08D3..08E1 ; Arabic # Mn [15] ARABIC SMALL LOW WAW..ARABIC SMALL HIGH SIGN SAFHA
|
||||
0870..0887 ; Arabic # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
|
||||
0888 ; Arabic # Sk ARABIC RAISED ROUND DOT
|
||||
0889..088E ; Arabic # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
|
||||
0890..0891 ; Arabic # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
|
||||
0898..089F ; Arabic # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
|
||||
08A0..08C8 ; Arabic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
|
||||
08C9 ; Arabic # Lm ARABIC SMALL FARSI YEH
|
||||
08CA..08E1 ; Arabic # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
|
||||
08E3..08FF ; Arabic # Mn [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA
|
||||
FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
|
||||
FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
|
||||
FBB2..FBC2 ; Arabic # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE
|
||||
FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
|
||||
FD40..FD4F ; Arabic # So [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH
|
||||
FD50..FD8F ; Arabic # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
|
||||
FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
|
||||
FDCF ; Arabic # So ARABIC LIGATURE SALAAMUHU ALAYNAA
|
||||
FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
|
||||
FDFC ; Arabic # Sc RIAL SIGN
|
||||
FDFD ; Arabic # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
|
||||
FDFD..FDFF ; Arabic # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL
|
||||
FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
|
||||
FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
|
||||
10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
|
||||
|
@ -893,7 +918,7 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA
|
|||
1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
|
||||
1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
|
||||
|
||||
# Total code points: 1291
|
||||
# Total code points: 1365
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1113,6 +1138,7 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY
|
|||
0C0E..0C10 ; Telugu # Lo [3] TELUGU LETTER E..TELUGU LETTER AI
|
||||
0C12..0C28 ; Telugu # Lo [23] TELUGU LETTER O..TELUGU LETTER NA
|
||||
0C2A..0C39 ; Telugu # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA
|
||||
0C3C ; Telugu # Mn TELUGU SIGN NUKTA
|
||||
0C3D ; Telugu # Lo TELUGU SIGN AVAGRAHA
|
||||
0C3E..0C40 ; Telugu # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
|
||||
0C41..0C44 ; Telugu # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
|
||||
|
@ -1120,6 +1146,7 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY
|
|||
0C4A..0C4D ; Telugu # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
|
||||
0C55..0C56 ; Telugu # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
|
||||
0C58..0C5A ; Telugu # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
|
||||
0C5D ; Telugu # Lo TELUGU LETTER NAKAARA POLLU
|
||||
0C60..0C61 ; Telugu # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
|
||||
0C62..0C63 ; Telugu # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
|
||||
0C66..0C6F ; Telugu # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE
|
||||
|
@ -1127,7 +1154,7 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY
|
|||
0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
|
||||
0C7F ; Telugu # So TELUGU SIGN TUUMU
|
||||
|
||||
# Total code points: 98
|
||||
# Total code points: 100
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1150,13 +1177,13 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY
|
|||
0CCA..0CCB ; Kannada # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO
|
||||
0CCC..0CCD ; Kannada # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
|
||||
0CD5..0CD6 ; Kannada # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
|
||||
0CDE ; Kannada # Lo KANNADA LETTER FA
|
||||
0CDD..0CDE ; Kannada # Lo [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
|
||||
0CE0..0CE1 ; Kannada # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
|
||||
0CE2..0CE3 ; Kannada # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
|
||||
0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
|
||||
0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
|
||||
|
||||
# Total code points: 89
|
||||
# Total code points: 90
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1411,8 +1438,12 @@ AB09..AB0E ; Ethiopic # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DD
|
|||
AB11..AB16 ; Ethiopic # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
|
||||
AB20..AB26 ; Ethiopic # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
|
||||
AB28..AB2E ; Ethiopic # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
|
||||
1E7E0..1E7E6 ; Ethiopic # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
|
||||
1E7E8..1E7EB ; Ethiopic # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
|
||||
1E7ED..1E7EE ; Ethiopic # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
|
||||
1E7F0..1E7FE ; Ethiopic # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
|
||||
|
||||
# Total code points: 495
|
||||
# Total code points: 523
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1430,8 +1461,9 @@ AB70..ABBF ; Cherokee # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETT
|
|||
166E ; Canadian_Aboriginal # Po CANADIAN SYLLABICS FULL STOP
|
||||
166F..167F ; Canadian_Aboriginal # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W
|
||||
18B0..18F5 ; Canadian_Aboriginal # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S
|
||||
11AB0..11ABF ; Canadian_Aboriginal # Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
|
||||
|
||||
# Total code points: 710
|
||||
# Total code points: 726
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1480,6 +1512,7 @@ AB70..ABBF ; Cherokee # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETT
|
|||
1807..180A ; Mongolian # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
|
||||
180B..180D ; Mongolian # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||
180E ; Mongolian # Cf MONGOLIAN VOWEL SEPARATOR
|
||||
180F ; Mongolian # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR
|
||||
1810..1819 ; Mongolian # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
|
||||
1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
|
||||
1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
|
||||
|
@ -1491,18 +1524,18 @@ AB70..ABBF ; Cherokee # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETT
|
|||
18AA ; Mongolian # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA
|
||||
11660..1166C ; Mongolian # Po [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
|
||||
|
||||
# Total code points: 167
|
||||
# Total code points: 168
|
||||
|
||||
# ================================================
|
||||
|
||||
3041..3096 ; Hiragana # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
|
||||
309D..309E ; Hiragana # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
|
||||
309F ; Hiragana # Lo HIRAGANA DIGRAPH YORI
|
||||
1B001..1B11E ; Hiragana # Lo [286] HIRAGANA LETTER ARCHAIC YE..HENTAIGANA LETTER N-MU-MO-2
|
||||
1B001..1B11F ; Hiragana # Lo [287] HIRAGANA LETTER ARCHAIC YE..HIRAGANA LETTER ARCHAIC WU
|
||||
1B150..1B152 ; Hiragana # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
|
||||
1F200 ; Hiragana # So SQUARE HIRAGANA HOKA
|
||||
|
||||
# Total code points: 379
|
||||
# Total code points: 380
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1514,10 +1547,14 @@ AB70..ABBF ; Cherokee # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETT
|
|||
3300..3357 ; Katakana # So [88] SQUARE APAATO..SQUARE WATTO
|
||||
FF66..FF6F ; Katakana # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU
|
||||
FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N
|
||||
1AFF0..1AFF3 ; Katakana # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
|
||||
1AFF5..1AFFB ; Katakana # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
|
||||
1AFFD..1AFFE ; Katakana # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
|
||||
1B000 ; Katakana # Lo KATAKANA LETTER ARCHAIC E
|
||||
1B120..1B122 ; Katakana # Lo [3] KATAKANA LETTER ARCHAIC YI..KATAKANA LETTER ARCHAIC WU
|
||||
1B164..1B167 ; Katakana # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
|
||||
|
||||
# Total code points: 304
|
||||
# Total code points: 320
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1538,19 +1575,21 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
|
|||
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
|
||||
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
|
||||
3400..4DBF ; Han # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
|
||||
4E00..9FFC ; Han # Lo [20989] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFC
|
||||
4E00..9FFF ; Han # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
|
||||
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
|
||||
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
|
||||
16FE2 ; Han # Po OLD CHINESE HOOK MARK
|
||||
16FE3 ; Han # Lm OLD CHINESE ITERATION MARK
|
||||
16FF0..16FF1 ; Han # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
|
||||
20000..2A6DD ; Han # Lo [42718] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DD
|
||||
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
|
||||
20000..2A6DF ; Han # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
|
||||
2A700..2B738 ; Han # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
|
||||
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
|
||||
2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
|
||||
2CEB0..2EBE0 ; Han # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
|
||||
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
|
||||
30000..3134A ; Han # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
|
||||
|
||||
# Total code points: 94204
|
||||
# Total code points: 94215
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1593,15 +1632,14 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE
|
|||
0951..0954 ; Inherited # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT
|
||||
1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
|
||||
1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY
|
||||
1ABF..1AC0 ; Inherited # Mn [2] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER TURNED W BELOW
|
||||
1ABF..1ACE ; Inherited # Mn [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
|
||||
1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
|
||||
1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
|
||||
1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
|
||||
1CED ; Inherited # Mn VEDIC SIGN TIRYAK
|
||||
1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE
|
||||
1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
1DC0..1DF9 ; Inherited # Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
|
||||
1DFB..1DFF ; Inherited # Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
1DC0..1DFF ; Inherited # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||||
200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
|
||||
20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
|
||||
20DD..20E0 ; Inherited # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
|
||||
|
@ -1615,26 +1653,30 @@ FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CON
|
|||
101FD ; Inherited # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
|
||||
102E0 ; Inherited # Mn COPTIC EPACT THOUSANDS MARK
|
||||
1133B ; Inherited # Mn COMBINING BINDU BELOW
|
||||
1CF00..1CF2D ; Inherited # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
|
||||
1CF30..1CF46 ; Inherited # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
|
||||
1D167..1D169 ; Inherited # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
|
||||
1D17B..1D182 ; Inherited # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
|
||||
1D185..1D18B ; Inherited # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
|
||||
1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
|
||||
E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
|
||||
|
||||
# Total code points: 573
|
||||
# Total code points: 657
|
||||
|
||||
# ================================================
|
||||
|
||||
1700..170C ; Tagalog # Lo [13] TAGALOG LETTER A..TAGALOG LETTER YA
|
||||
170E..1711 ; Tagalog # Lo [4] TAGALOG LETTER LA..TAGALOG LETTER HA
|
||||
1700..1711 ; Tagalog # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA
|
||||
1712..1714 ; Tagalog # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
|
||||
1715 ; Tagalog # Mc TAGALOG SIGN PAMUDPOD
|
||||
171F ; Tagalog # Lo TAGALOG LETTER ARCHAIC RA
|
||||
|
||||
# Total code points: 20
|
||||
# Total code points: 23
|
||||
|
||||
# ================================================
|
||||
|
||||
1720..1731 ; Hanunoo # Lo [18] HANUNOO LETTER A..HANUNOO LETTER HA
|
||||
1732..1734 ; Hanunoo # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
|
||||
1732..1733 ; Hanunoo # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
|
||||
1734 ; Hanunoo # Mc HANUNOO SIGN PAMUDPOD
|
||||
|
||||
# Total code points: 21
|
||||
|
||||
|
@ -1762,15 +1804,14 @@ E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-2
|
|||
|
||||
# ================================================
|
||||
|
||||
2C00..2C2E ; Glagolitic # L& [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
|
||||
2C30..2C5E ; Glagolitic # L& [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE
|
||||
2C00..2C5F ; Glagolitic # L& [96] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
|
||||
1E000..1E006 ; Glagolitic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
|
||||
1E008..1E018 ; Glagolitic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
|
||||
1E01B..1E021 ; Glagolitic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
|
||||
1E023..1E024 ; Glagolitic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
|
||||
1E026..1E02A ; Glagolitic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
|
||||
|
||||
# Total code points: 132
|
||||
# Total code points: 134
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -1836,14 +1877,15 @@ A82C ; Syloti_Nagri # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA
|
|||
1B3D..1B41 ; Balinese # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG
|
||||
1B42 ; Balinese # Mn BALINESE VOWEL SIGN PEPET
|
||||
1B43..1B44 ; Balinese # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG
|
||||
1B45..1B4B ; Balinese # Lo [7] BALINESE LETTER KAF SASAK..BALINESE LETTER ASYURA SASAK
|
||||
1B45..1B4C ; Balinese # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
|
||||
1B50..1B59 ; Balinese # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE
|
||||
1B5A..1B60 ; Balinese # Po [7] BALINESE PANTI..BALINESE PAMENENG
|
||||
1B61..1B6A ; Balinese # So [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE
|
||||
1B6B..1B73 ; Balinese # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
|
||||
1B74..1B7C ; Balinese # So [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING
|
||||
1B7D..1B7E ; Balinese # Po [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG
|
||||
|
||||
# Total code points: 121
|
||||
# Total code points: 124
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2178,9 +2220,10 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
|
|||
110BB..110BC ; Kaithi # Po [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN
|
||||
110BD ; Kaithi # Cf KAITHI NUMBER SIGN
|
||||
110BE..110C1 ; Kaithi # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
|
||||
110C2 ; Kaithi # Mn KAITHI VOWEL SIGN VOCALIC R
|
||||
110CD ; Kaithi # Cf KAITHI NUMBER SIGN ABOVE
|
||||
|
||||
# Total code points: 67
|
||||
# Total code points: 68
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2207,9 +2250,13 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
|
|||
11047..1104D ; Brahmi # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
|
||||
11052..11065 ; Brahmi # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
|
||||
11066..1106F ; Brahmi # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
|
||||
11070 ; Brahmi # Mn BRAHMI SIGN OLD TAMIL VIRAMA
|
||||
11071..11072 ; Brahmi # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
|
||||
11073..11074 ; Brahmi # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
|
||||
11075 ; Brahmi # Lo BRAHMI LETTER OLD TAMIL LLA
|
||||
1107F ; Brahmi # Mn BRAHMI NUMBER JOINER
|
||||
|
||||
# Total code points: 109
|
||||
# Total code points: 115
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2301,9 +2348,10 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
|
|||
116B6 ; Takri # Mc TAKRI SIGN VIRAMA
|
||||
116B7 ; Takri # Mn TAKRI SIGN NUKTA
|
||||
116B8 ; Takri # Lo TAKRI LETTER ARCHAIC KHA
|
||||
116B9 ; Takri # Po TAKRI ABBREVIATION SIGN
|
||||
116C0..116C9 ; Takri # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE
|
||||
|
||||
# Total code points: 67
|
||||
# Total code points: 68
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2561,8 +2609,9 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
|
|||
1173A..1173B ; Ahom # No [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY
|
||||
1173C..1173E ; Ahom # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
|
||||
1173F ; Ahom # So AHOM SYMBOL VI
|
||||
11740..11746 ; Ahom # Lo [7] AHOM LETTER CA..AHOM LETTER LLA
|
||||
|
||||
# Total code points: 58
|
||||
# Total code points: 65
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -2897,4 +2946,46 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
|
|||
|
||||
# Total code points: 47
|
||||
|
||||
# ================================================
|
||||
|
||||
12F90..12FF0 ; Cypro_Minoan # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
|
||||
12FF1..12FF2 ; Cypro_Minoan # Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
|
||||
|
||||
# Total code points: 99
|
||||
|
||||
# ================================================
|
||||
|
||||
10F70..10F81 ; Old_Uyghur # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
|
||||
10F82..10F85 ; Old_Uyghur # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
|
||||
10F86..10F89 ; Old_Uyghur # Po [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS
|
||||
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
16A70..16ABE ; Tangsa # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA
|
||||
16AC0..16AC9 ; Tangsa # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
|
||||
|
||||
# Total code points: 89
|
||||
|
||||
# ================================================
|
||||
|
||||
1E290..1E2AD ; Toto # Lo [30] TOTO LETTER PA..TOTO LETTER A
|
||||
1E2AE ; Toto # Mn TOTO SIGN RISING TONE
|
||||
|
||||
# Total code points: 31
|
||||
|
||||
# ================================================
|
||||
|
||||
10570..1057A ; Vithkuqi # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
|
||||
1057C..1058A ; Vithkuqi # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
|
||||
1058C..10592 ; Vithkuqi # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
|
||||
10594..10595 ; Vithkuqi # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
|
||||
10597..105A1 ; Vithkuqi # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
|
||||
105A3..105B1 ; Vithkuqi # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
|
||||
105B3..105B9 ; Vithkuqi # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
|
||||
105BB..105BC ; Vithkuqi # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
|
||||
|
||||
# Total code points: 70
|
||||
|
||||
# EOF
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,11 +1,11 @@
|
|||
# emoji-data.txt
|
||||
# Date: 2020-01-28, 20:52:38 GMT
|
||||
# © 2020 Unicode®, Inc.
|
||||
# emoji-data-14.0.0.txt
|
||||
# Date: 2021-08-26, 17:22:22 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Emoji Data for UTS #51
|
||||
# Version: 13.0
|
||||
# Used with Emoji Version 14.0 and subsequent minor revisions (if any)
|
||||
#
|
||||
# For documentation and usage, see http://www.unicode.org/reports/tr51
|
||||
#
|
||||
|
@ -22,7 +22,7 @@
|
|||
# All omitted code points have Emoji=No
|
||||
# @missing: 0000..10FFFF ; Emoji ; No
|
||||
|
||||
0023 ; Emoji # E0.0 [1] (#️) number sign
|
||||
0023 ; Emoji # E0.0 [1] (#️) hash sign
|
||||
002A ; Emoji # E0.0 [1] (*️) asterisk
|
||||
0030..0039 ; Emoji # E0.0 [10] (0️..9️) digit zero..digit nine
|
||||
00A9 ; Emoji # E0.6 [1] (©️) copyright
|
||||
|
@ -119,8 +119,8 @@
|
|||
2747 ; Emoji # E0.6 [1] (❇️) sparkle
|
||||
274C ; Emoji # E0.6 [1] (❌) cross mark
|
||||
274E ; Emoji # E0.6 [1] (❎) cross mark button
|
||||
2753..2755 ; Emoji # E0.6 [3] (❓..❕) question mark..white exclamation mark
|
||||
2757 ; Emoji # E0.6 [1] (❗) exclamation mark
|
||||
2753..2755 ; Emoji # E0.6 [3] (❓..❕) red question mark..white exclamation mark
|
||||
2757 ; Emoji # E0.6 [1] (❗) red exclamation mark
|
||||
2763 ; Emoji # E1.0 [1] (❣️) heart exclamation
|
||||
2764 ; Emoji # E0.6 [1] (❤️) red heart
|
||||
2795..2797 ; Emoji # E0.6 [3] (➕..➗) plus..divide
|
||||
|
@ -239,7 +239,7 @@
|
|||
1F509 ; Emoji # E1.0 [1] (🔉) speaker medium volume
|
||||
1F50A..1F514 ; Emoji # E0.6 [11] (🔊..🔔) speaker high volume..bell
|
||||
1F515 ; Emoji # E1.0 [1] (🔕) bell with slash
|
||||
1F516..1F52B ; Emoji # E0.6 [22] (🔖..🔫) bookmark..pistol
|
||||
1F516..1F52B ; Emoji # E0.6 [22] (🔖..🔫) bookmark..water pistol
|
||||
1F52C..1F52D ; Emoji # E1.0 [2] (🔬..🔭) microscope..telescope
|
||||
1F52E..1F53D ; Emoji # E0.6 [16] (🔮..🔽) crystal ball..downwards button
|
||||
1F549..1F54A ; Emoji # E0.7 [2] (🕉️..🕊️) om..dove
|
||||
|
@ -294,7 +294,7 @@
|
|||
1F62E..1F62F ; Emoji # E1.0 [2] (😮..😯) face with open mouth..hushed face
|
||||
1F630..1F633 ; Emoji # E0.6 [4] (😰..😳) anxious face with sweat..flushed face
|
||||
1F634 ; Emoji # E1.0 [1] (😴) sleeping face
|
||||
1F635 ; Emoji # E0.6 [1] (😵) dizzy face
|
||||
1F635 ; Emoji # E0.6 [1] (😵) face with crossed-out eyes
|
||||
1F636 ; Emoji # E1.0 [1] (😶) face without mouth
|
||||
1F637..1F640 ; Emoji # E0.6 [10] (😷..🙀) face with medical mask..weary cat
|
||||
1F641..1F644 ; Emoji # E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
|
||||
|
@ -341,6 +341,7 @@
|
|||
1F6D1..1F6D2 ; Emoji # E3.0 [2] (🛑..🛒) stop sign..shopping cart
|
||||
1F6D5 ; Emoji # E12.0 [1] (🛕) hindu temple
|
||||
1F6D6..1F6D7 ; Emoji # E13.0 [2] (🛖..🛗) hut..elevator
|
||||
1F6DD..1F6DF ; Emoji # E14.0 [3] (🛝..🛟) playground slide..ring buoy
|
||||
1F6E0..1F6E5 ; Emoji # E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
|
||||
1F6E9 ; Emoji # E0.7 [1] (🛩️) small airplane
|
||||
1F6EB..1F6EC ; Emoji # E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
|
||||
|
@ -352,6 +353,7 @@
|
|||
1F6FA ; Emoji # E12.0 [1] (🛺) auto rickshaw
|
||||
1F6FB..1F6FC ; Emoji # E13.0 [2] (🛻..🛼) pickup truck..roller skate
|
||||
1F7E0..1F7EB ; Emoji # E12.0 [12] (🟠..🟫) orange circle..brown square
|
||||
1F7F0 ; Emoji # E14.0 [1] (🟰) heavy equals sign
|
||||
1F90C ; Emoji # E13.0 [1] (🤌) pinched fingers
|
||||
1F90D..1F90F ; Emoji # E12.0 [3] (🤍..🤏) white heart..pinching hand
|
||||
1F910..1F918 ; Emoji # E1.0 [9] (🤐..🤘) zipper-mouth face..sign of the horns
|
||||
|
@ -375,6 +377,7 @@
|
|||
1F972 ; Emoji # E13.0 [1] (🥲) smiling face with tear
|
||||
1F973..1F976 ; Emoji # E11.0 [4] (🥳..🥶) partying face..cold face
|
||||
1F977..1F978 ; Emoji # E13.0 [2] (🥷..🥸) ninja..disguised face
|
||||
1F979 ; Emoji # E14.0 [1] (🥹) face holding back tears
|
||||
1F97A ; Emoji # E11.0 [1] (🥺) pleading face
|
||||
1F97B ; Emoji # E12.0 [1] (🥻) sari
|
||||
1F97C..1F97F ; Emoji # E11.0 [4] (🥼..🥿) lab coat..flat shoe
|
||||
|
@ -392,21 +395,29 @@
|
|||
1F9C1..1F9C2 ; Emoji # E11.0 [2] (🧁..🧂) cupcake..salt
|
||||
1F9C3..1F9CA ; Emoji # E12.0 [8] (🧃..🧊) beverage box..ice
|
||||
1F9CB ; Emoji # E13.0 [1] (🧋) bubble tea
|
||||
1F9CC ; Emoji # E14.0 [1] (🧌) troll
|
||||
1F9CD..1F9CF ; Emoji # E12.0 [3] (🧍..🧏) person standing..deaf person
|
||||
1F9D0..1F9E6 ; Emoji # E5.0 [23] (🧐..🧦) face with monocle..socks
|
||||
1F9E7..1F9FF ; Emoji # E11.0 [25] (🧧..🧿) red envelope..nazar amulet
|
||||
1FA70..1FA73 ; Emoji # E12.0 [4] (🩰..🩳) ballet shoes..shorts
|
||||
1FA74 ; Emoji # E13.0 [1] (🩴) thong sandal
|
||||
1FA78..1FA7A ; Emoji # E12.0 [3] (🩸..🩺) drop of blood..stethoscope
|
||||
1FA7B..1FA7C ; Emoji # E14.0 [2] (🩻..🩼) x-ray..crutch
|
||||
1FA80..1FA82 ; Emoji # E12.0 [3] (🪀..🪂) yo-yo..parachute
|
||||
1FA83..1FA86 ; Emoji # E13.0 [4] (🪃..🪆) boomerang..nesting dolls
|
||||
1FA90..1FA95 ; Emoji # E12.0 [6] (🪐..🪕) ringed planet..banjo
|
||||
1FA96..1FAA8 ; Emoji # E13.0 [19] (🪖..🪨) military helmet..rock
|
||||
1FAA9..1FAAC ; Emoji # E14.0 [4] (🪩..🪬) mirror ball..hamsa
|
||||
1FAB0..1FAB6 ; Emoji # E13.0 [7] (🪰..🪶) fly..feather
|
||||
1FAB7..1FABA ; Emoji # E14.0 [4] (🪷..🪺) lotus..nest with eggs
|
||||
1FAC0..1FAC2 ; Emoji # E13.0 [3] (🫀..🫂) anatomical heart..people hugging
|
||||
1FAC3..1FAC5 ; Emoji # E14.0 [3] (🫃..🫅) pregnant man..person with crown
|
||||
1FAD0..1FAD6 ; Emoji # E13.0 [7] (🫐..🫖) blueberries..teapot
|
||||
1FAD7..1FAD9 ; Emoji # E14.0 [3] (🫗..🫙) pouring liquid..jar
|
||||
1FAE0..1FAE7 ; Emoji # E14.0 [8] (🫠..🫧) melting face..bubbles
|
||||
1FAF0..1FAF6 ; Emoji # E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
|
||||
|
||||
# Total elements: 1367
|
||||
# Total elements: 1404
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -438,8 +449,8 @@
|
|||
2728 ; Emoji_Presentation # E0.6 [1] (✨) sparkles
|
||||
274C ; Emoji_Presentation # E0.6 [1] (❌) cross mark
|
||||
274E ; Emoji_Presentation # E0.6 [1] (❎) cross mark button
|
||||
2753..2755 ; Emoji_Presentation # E0.6 [3] (❓..❕) question mark..white exclamation mark
|
||||
2757 ; Emoji_Presentation # E0.6 [1] (❗) exclamation mark
|
||||
2753..2755 ; Emoji_Presentation # E0.6 [3] (❓..❕) red question mark..white exclamation mark
|
||||
2757 ; Emoji_Presentation # E0.6 [1] (❗) red exclamation mark
|
||||
2795..2797 ; Emoji_Presentation # E0.6 [3] (➕..➗) plus..divide
|
||||
27B0 ; Emoji_Presentation # E0.6 [1] (➰) curly loop
|
||||
27BF ; Emoji_Presentation # E1.0 [1] (➿) double curly loop
|
||||
|
@ -533,7 +544,7 @@
|
|||
1F509 ; Emoji_Presentation # E1.0 [1] (🔉) speaker medium volume
|
||||
1F50A..1F514 ; Emoji_Presentation # E0.6 [11] (🔊..🔔) speaker high volume..bell
|
||||
1F515 ; Emoji_Presentation # E1.0 [1] (🔕) bell with slash
|
||||
1F516..1F52B ; Emoji_Presentation # E0.6 [22] (🔖..🔫) bookmark..pistol
|
||||
1F516..1F52B ; Emoji_Presentation # E0.6 [22] (🔖..🔫) bookmark..water pistol
|
||||
1F52C..1F52D ; Emoji_Presentation # E1.0 [2] (🔬..🔭) microscope..telescope
|
||||
1F52E..1F53D ; Emoji_Presentation # E0.6 [16] (🔮..🔽) crystal ball..downwards button
|
||||
1F54B..1F54E ; Emoji_Presentation # E1.0 [4] (🕋..🕎) kaaba..menorah
|
||||
|
@ -569,7 +580,7 @@
|
|||
1F62E..1F62F ; Emoji_Presentation # E1.0 [2] (😮..😯) face with open mouth..hushed face
|
||||
1F630..1F633 ; Emoji_Presentation # E0.6 [4] (😰..😳) anxious face with sweat..flushed face
|
||||
1F634 ; Emoji_Presentation # E1.0 [1] (😴) sleeping face
|
||||
1F635 ; Emoji_Presentation # E0.6 [1] (😵) dizzy face
|
||||
1F635 ; Emoji_Presentation # E0.6 [1] (😵) face with crossed-out eyes
|
||||
1F636 ; Emoji_Presentation # E1.0 [1] (😶) face without mouth
|
||||
1F637..1F640 ; Emoji_Presentation # E0.6 [10] (😷..🙀) face with medical mask..weary cat
|
||||
1F641..1F644 ; Emoji_Presentation # E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
|
||||
|
@ -614,6 +625,7 @@
|
|||
1F6D1..1F6D2 ; Emoji_Presentation # E3.0 [2] (🛑..🛒) stop sign..shopping cart
|
||||
1F6D5 ; Emoji_Presentation # E12.0 [1] (🛕) hindu temple
|
||||
1F6D6..1F6D7 ; Emoji_Presentation # E13.0 [2] (🛖..🛗) hut..elevator
|
||||
1F6DD..1F6DF ; Emoji_Presentation # E14.0 [3] (🛝..🛟) playground slide..ring buoy
|
||||
1F6EB..1F6EC ; Emoji_Presentation # E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
|
||||
1F6F4..1F6F6 ; Emoji_Presentation # E3.0 [3] (🛴..🛶) kick scooter..canoe
|
||||
1F6F7..1F6F8 ; Emoji_Presentation # E5.0 [2] (🛷..🛸) sled..flying saucer
|
||||
|
@ -621,6 +633,7 @@
|
|||
1F6FA ; Emoji_Presentation # E12.0 [1] (🛺) auto rickshaw
|
||||
1F6FB..1F6FC ; Emoji_Presentation # E13.0 [2] (🛻..🛼) pickup truck..roller skate
|
||||
1F7E0..1F7EB ; Emoji_Presentation # E12.0 [12] (🟠..🟫) orange circle..brown square
|
||||
1F7F0 ; Emoji_Presentation # E14.0 [1] (🟰) heavy equals sign
|
||||
1F90C ; Emoji_Presentation # E13.0 [1] (🤌) pinched fingers
|
||||
1F90D..1F90F ; Emoji_Presentation # E12.0 [3] (🤍..🤏) white heart..pinching hand
|
||||
1F910..1F918 ; Emoji_Presentation # E1.0 [9] (🤐..🤘) zipper-mouth face..sign of the horns
|
||||
|
@ -644,6 +657,7 @@
|
|||
1F972 ; Emoji_Presentation # E13.0 [1] (🥲) smiling face with tear
|
||||
1F973..1F976 ; Emoji_Presentation # E11.0 [4] (🥳..🥶) partying face..cold face
|
||||
1F977..1F978 ; Emoji_Presentation # E13.0 [2] (🥷..🥸) ninja..disguised face
|
||||
1F979 ; Emoji_Presentation # E14.0 [1] (🥹) face holding back tears
|
||||
1F97A ; Emoji_Presentation # E11.0 [1] (🥺) pleading face
|
||||
1F97B ; Emoji_Presentation # E12.0 [1] (🥻) sari
|
||||
1F97C..1F97F ; Emoji_Presentation # E11.0 [4] (🥼..🥿) lab coat..flat shoe
|
||||
|
@ -661,21 +675,29 @@
|
|||
1F9C1..1F9C2 ; Emoji_Presentation # E11.0 [2] (🧁..🧂) cupcake..salt
|
||||
1F9C3..1F9CA ; Emoji_Presentation # E12.0 [8] (🧃..🧊) beverage box..ice
|
||||
1F9CB ; Emoji_Presentation # E13.0 [1] (🧋) bubble tea
|
||||
1F9CC ; Emoji_Presentation # E14.0 [1] (🧌) troll
|
||||
1F9CD..1F9CF ; Emoji_Presentation # E12.0 [3] (🧍..🧏) person standing..deaf person
|
||||
1F9D0..1F9E6 ; Emoji_Presentation # E5.0 [23] (🧐..🧦) face with monocle..socks
|
||||
1F9E7..1F9FF ; Emoji_Presentation # E11.0 [25] (🧧..🧿) red envelope..nazar amulet
|
||||
1FA70..1FA73 ; Emoji_Presentation # E12.0 [4] (🩰..🩳) ballet shoes..shorts
|
||||
1FA74 ; Emoji_Presentation # E13.0 [1] (🩴) thong sandal
|
||||
1FA78..1FA7A ; Emoji_Presentation # E12.0 [3] (🩸..🩺) drop of blood..stethoscope
|
||||
1FA7B..1FA7C ; Emoji_Presentation # E14.0 [2] (🩻..🩼) x-ray..crutch
|
||||
1FA80..1FA82 ; Emoji_Presentation # E12.0 [3] (🪀..🪂) yo-yo..parachute
|
||||
1FA83..1FA86 ; Emoji_Presentation # E13.0 [4] (🪃..🪆) boomerang..nesting dolls
|
||||
1FA90..1FA95 ; Emoji_Presentation # E12.0 [6] (🪐..🪕) ringed planet..banjo
|
||||
1FA96..1FAA8 ; Emoji_Presentation # E13.0 [19] (🪖..🪨) military helmet..rock
|
||||
1FAA9..1FAAC ; Emoji_Presentation # E14.0 [4] (🪩..🪬) mirror ball..hamsa
|
||||
1FAB0..1FAB6 ; Emoji_Presentation # E13.0 [7] (🪰..🪶) fly..feather
|
||||
1FAB7..1FABA ; Emoji_Presentation # E14.0 [4] (🪷..🪺) lotus..nest with eggs
|
||||
1FAC0..1FAC2 ; Emoji_Presentation # E13.0 [3] (🫀..🫂) anatomical heart..people hugging
|
||||
1FAC3..1FAC5 ; Emoji_Presentation # E14.0 [3] (🫃..🫅) pregnant man..person with crown
|
||||
1FAD0..1FAD6 ; Emoji_Presentation # E13.0 [7] (🫐..🫖) blueberries..teapot
|
||||
1FAD7..1FAD9 ; Emoji_Presentation # E14.0 [3] (🫗..🫙) pouring liquid..jar
|
||||
1FAE0..1FAE7 ; Emoji_Presentation # E14.0 [8] (🫠..🫧) melting face..bubbles
|
||||
1FAF0..1FAF6 ; Emoji_Presentation # E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
|
||||
|
||||
# Total elements: 1148
|
||||
# Total elements: 1185
|
||||
|
||||
# ================================================
|
||||
|
||||
|
@ -738,15 +760,17 @@
|
|||
1F9BB ; Emoji_Modifier_Base # E12.0 [1] (🦻) ear with hearing aid
|
||||
1F9CD..1F9CF ; Emoji_Modifier_Base # E12.0 [3] (🧍..🧏) person standing..deaf person
|
||||
1F9D1..1F9DD ; Emoji_Modifier_Base # E5.0 [13] (🧑..🧝) person..elf
|
||||
1FAC3..1FAC5 ; Emoji_Modifier_Base # E14.0 [3] (🫃..🫅) pregnant man..person with crown
|
||||
1FAF0..1FAF6 ; Emoji_Modifier_Base # E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
|
||||
|
||||
# Total elements: 122
|
||||
# Total elements: 132
|
||||
|
||||
# ================================================
|
||||
|
||||
# All omitted code points have Emoji_Component=No
|
||||
# @missing: 0000..10FFFF ; Emoji_Component ; No
|
||||
|
||||
0023 ; Emoji_Component # E0.0 [1] (#️) number sign
|
||||
0023 ; Emoji_Component # E0.0 [1] (#️) hash sign
|
||||
002A ; Emoji_Component # E0.0 [1] (*️) asterisk
|
||||
0030..0039 ; Emoji_Component # E0.0 [10] (0️..9️) digit zero..digit nine
|
||||
200D ; Emoji_Component # E0.0 [1] () zero width joiner
|
||||
|
@ -902,8 +926,8 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
2747 ; Extended_Pictographic# E0.6 [1] (❇️) sparkle
|
||||
274C ; Extended_Pictographic# E0.6 [1] (❌) cross mark
|
||||
274E ; Extended_Pictographic# E0.6 [1] (❎) cross mark button
|
||||
2753..2755 ; Extended_Pictographic# E0.6 [3] (❓..❕) question mark..white exclamation mark
|
||||
2757 ; Extended_Pictographic# E0.6 [1] (❗) exclamation mark
|
||||
2753..2755 ; Extended_Pictographic# E0.6 [3] (❓..❕) red question mark..white exclamation mark
|
||||
2757 ; Extended_Pictographic# E0.6 [1] (❗) red exclamation mark
|
||||
2763 ; Extended_Pictographic# E1.0 [1] (❣️) heart exclamation
|
||||
2764 ; Extended_Pictographic# E0.6 [1] (❤️) red heart
|
||||
2765..2767 ; Extended_Pictographic# E0.0 [3] (❥..❧) ROTATED HEAVY BLACK HEART BULLET..ROTATED FLORAL HEART BULLET
|
||||
|
@ -1041,7 +1065,7 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1F509 ; Extended_Pictographic# E1.0 [1] (🔉) speaker medium volume
|
||||
1F50A..1F514 ; Extended_Pictographic# E0.6 [11] (🔊..🔔) speaker high volume..bell
|
||||
1F515 ; Extended_Pictographic# E1.0 [1] (🔕) bell with slash
|
||||
1F516..1F52B ; Extended_Pictographic# E0.6 [22] (🔖..🔫) bookmark..pistol
|
||||
1F516..1F52B ; Extended_Pictographic# E0.6 [22] (🔖..🔫) bookmark..water pistol
|
||||
1F52C..1F52D ; Extended_Pictographic# E1.0 [2] (🔬..🔭) microscope..telescope
|
||||
1F52E..1F53D ; Extended_Pictographic# E0.6 [16] (🔮..🔽) crystal ball..downwards button
|
||||
1F546..1F548 ; Extended_Pictographic# E0.0 [3] (🕆..🕈) WHITE LATIN CROSS..CELTIC CROSS
|
||||
|
@ -1117,7 +1141,7 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1F62E..1F62F ; Extended_Pictographic# E1.0 [2] (😮..😯) face with open mouth..hushed face
|
||||
1F630..1F633 ; Extended_Pictographic# E0.6 [4] (😰..😳) anxious face with sweat..flushed face
|
||||
1F634 ; Extended_Pictographic# E1.0 [1] (😴) sleeping face
|
||||
1F635 ; Extended_Pictographic# E0.6 [1] (😵) dizzy face
|
||||
1F635 ; Extended_Pictographic# E0.6 [1] (😵) face with crossed-out eyes
|
||||
1F636 ; Extended_Pictographic# E1.0 [1] (😶) face without mouth
|
||||
1F637..1F640 ; Extended_Pictographic# E0.6 [10] (😷..🙀) face with medical mask..weary cat
|
||||
1F641..1F644 ; Extended_Pictographic# E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
|
||||
|
@ -1166,7 +1190,8 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1F6D3..1F6D4 ; Extended_Pictographic# E0.0 [2] (🛓..🛔) STUPA..PAGODA
|
||||
1F6D5 ; Extended_Pictographic# E12.0 [1] (🛕) hindu temple
|
||||
1F6D6..1F6D7 ; Extended_Pictographic# E13.0 [2] (🛖..🛗) hut..elevator
|
||||
1F6D8..1F6DF ; Extended_Pictographic# E0.0 [8] (..🛟) <reserved-1F6D8>..<reserved-1F6DF>
|
||||
1F6D8..1F6DC ; Extended_Pictographic# E0.0 [5] (..🛜) <reserved-1F6D8>..<reserved-1F6DC>
|
||||
1F6DD..1F6DF ; Extended_Pictographic# E14.0 [3] (🛝..🛟) playground slide..ring buoy
|
||||
1F6E0..1F6E5 ; Extended_Pictographic# E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
|
||||
1F6E6..1F6E8 ; Extended_Pictographic# E0.0 [3] (🛦..🛨) UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE
|
||||
1F6E9 ; Extended_Pictographic# E0.7 [1] (🛩️) small airplane
|
||||
|
@ -1185,7 +1210,9 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1F774..1F77F ; Extended_Pictographic# E0.0 [12] (🝴..🝿) <reserved-1F774>..<reserved-1F77F>
|
||||
1F7D5..1F7DF ; Extended_Pictographic# E0.0 [11] (🟕..) CIRCLED TRIANGLE..<reserved-1F7DF>
|
||||
1F7E0..1F7EB ; Extended_Pictographic# E12.0 [12] (🟠..🟫) orange circle..brown square
|
||||
1F7EC..1F7FF ; Extended_Pictographic# E0.0 [20] (..) <reserved-1F7EC>..<reserved-1F7FF>
|
||||
1F7EC..1F7EF ; Extended_Pictographic# E0.0 [4] (..) <reserved-1F7EC>..<reserved-1F7EF>
|
||||
1F7F0 ; Extended_Pictographic# E14.0 [1] (🟰) heavy equals sign
|
||||
1F7F1..1F7FF ; Extended_Pictographic# E0.0 [15] (..) <reserved-1F7F1>..<reserved-1F7FF>
|
||||
1F80C..1F80F ; Extended_Pictographic# E0.0 [4] (..) <reserved-1F80C>..<reserved-1F80F>
|
||||
1F848..1F84F ; Extended_Pictographic# E0.0 [8] (..) <reserved-1F848>..<reserved-1F84F>
|
||||
1F85A..1F85F ; Extended_Pictographic# E0.0 [6] (..) <reserved-1F85A>..<reserved-1F85F>
|
||||
|
@ -1214,7 +1241,7 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1F972 ; Extended_Pictographic# E13.0 [1] (🥲) smiling face with tear
|
||||
1F973..1F976 ; Extended_Pictographic# E11.0 [4] (🥳..🥶) partying face..cold face
|
||||
1F977..1F978 ; Extended_Pictographic# E13.0 [2] (🥷..🥸) ninja..disguised face
|
||||
1F979 ; Extended_Pictographic# E0.0 [1] (🥹) <reserved-1F979>
|
||||
1F979 ; Extended_Pictographic# E14.0 [1] (🥹) face holding back tears
|
||||
1F97A ; Extended_Pictographic# E11.0 [1] (🥺) pleading face
|
||||
1F97B ; Extended_Pictographic# E12.0 [1] (🥻) sari
|
||||
1F97C..1F97F ; Extended_Pictographic# E11.0 [4] (🥼..🥿) lab coat..flat shoe
|
||||
|
@ -1232,7 +1259,7 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1F9C1..1F9C2 ; Extended_Pictographic# E11.0 [2] (🧁..🧂) cupcake..salt
|
||||
1F9C3..1F9CA ; Extended_Pictographic# E12.0 [8] (🧃..🧊) beverage box..ice
|
||||
1F9CB ; Extended_Pictographic# E13.0 [1] (🧋) bubble tea
|
||||
1F9CC ; Extended_Pictographic# E0.0 [1] (🧌) <reserved-1F9CC>
|
||||
1F9CC ; Extended_Pictographic# E14.0 [1] (🧌) troll
|
||||
1F9CD..1F9CF ; Extended_Pictographic# E12.0 [3] (🧍..🧏) person standing..deaf person
|
||||
1F9D0..1F9E6 ; Extended_Pictographic# E5.0 [23] (🧐..🧦) face with monocle..socks
|
||||
1F9E7..1F9FF ; Extended_Pictographic# E11.0 [25] (🧧..🧿) red envelope..nazar amulet
|
||||
|
@ -1241,19 +1268,28 @@ E0020..E007F ; Emoji_Component # E0.0 [96] (..) tag space..c
|
|||
1FA74 ; Extended_Pictographic# E13.0 [1] (🩴) thong sandal
|
||||
1FA75..1FA77 ; Extended_Pictographic# E0.0 [3] (🩵..🩷) <reserved-1FA75>..<reserved-1FA77>
|
||||
1FA78..1FA7A ; Extended_Pictographic# E12.0 [3] (🩸..🩺) drop of blood..stethoscope
|
||||
1FA7B..1FA7F ; Extended_Pictographic# E0.0 [5] (🩻..) <reserved-1FA7B>..<reserved-1FA7F>
|
||||
1FA7B..1FA7C ; Extended_Pictographic# E14.0 [2] (🩻..🩼) x-ray..crutch
|
||||
1FA7D..1FA7F ; Extended_Pictographic# E0.0 [3] (..) <reserved-1FA7D>..<reserved-1FA7F>
|
||||
1FA80..1FA82 ; Extended_Pictographic# E12.0 [3] (🪀..🪂) yo-yo..parachute
|
||||
1FA83..1FA86 ; Extended_Pictographic# E13.0 [4] (🪃..🪆) boomerang..nesting dolls
|
||||
1FA87..1FA8F ; Extended_Pictographic# E0.0 [9] (🪇..) <reserved-1FA87>..<reserved-1FA8F>
|
||||
1FA90..1FA95 ; Extended_Pictographic# E12.0 [6] (🪐..🪕) ringed planet..banjo
|
||||
1FA96..1FAA8 ; Extended_Pictographic# E13.0 [19] (🪖..🪨) military helmet..rock
|
||||
1FAA9..1FAAF ; Extended_Pictographic# E0.0 [7] (🪩..🪯) <reserved-1FAA9>..<reserved-1FAAF>
|
||||
1FAA9..1FAAC ; Extended_Pictographic# E14.0 [4] (🪩..🪬) mirror ball..hamsa
|
||||
1FAAD..1FAAF ; Extended_Pictographic# E0.0 [3] (🪭..🪯) <reserved-1FAAD>..<reserved-1FAAF>
|
||||
1FAB0..1FAB6 ; Extended_Pictographic# E13.0 [7] (🪰..🪶) fly..feather
|
||||
1FAB7..1FABF ; Extended_Pictographic# E0.0 [9] (🪷..🪿) <reserved-1FAB7>..<reserved-1FABF>
|
||||
1FAB7..1FABA ; Extended_Pictographic# E14.0 [4] (🪷..🪺) lotus..nest with eggs
|
||||
1FABB..1FABF ; Extended_Pictographic# E0.0 [5] (🪻..🪿) <reserved-1FABB>..<reserved-1FABF>
|
||||
1FAC0..1FAC2 ; Extended_Pictographic# E13.0 [3] (🫀..🫂) anatomical heart..people hugging
|
||||
1FAC3..1FACF ; Extended_Pictographic# E0.0 [13] (🫃..🫏) <reserved-1FAC3>..<reserved-1FACF>
|
||||
1FAC3..1FAC5 ; Extended_Pictographic# E14.0 [3] (🫃..🫅) pregnant man..person with crown
|
||||
1FAC6..1FACF ; Extended_Pictographic# E0.0 [10] (..🫏) <reserved-1FAC6>..<reserved-1FACF>
|
||||
1FAD0..1FAD6 ; Extended_Pictographic# E13.0 [7] (🫐..🫖) blueberries..teapot
|
||||
1FAD7..1FAFF ; Extended_Pictographic# E0.0 [41] (🫗..) <reserved-1FAD7>..<reserved-1FAFF>
|
||||
1FAD7..1FAD9 ; Extended_Pictographic# E14.0 [3] (🫗..🫙) pouring liquid..jar
|
||||
1FADA..1FADF ; Extended_Pictographic# E0.0 [6] (🫚..) <reserved-1FADA>..<reserved-1FADF>
|
||||
1FAE0..1FAE7 ; Extended_Pictographic# E14.0 [8] (🫠..🫧) melting face..bubbles
|
||||
1FAE8..1FAEF ; Extended_Pictographic# E0.0 [8] (🫨..) <reserved-1FAE8>..<reserved-1FAEF>
|
||||
1FAF0..1FAF6 ; Extended_Pictographic# E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
|
||||
1FAF7..1FAFF ; Extended_Pictographic# E0.0 [9] (🫷..) <reserved-1FAF7>..<reserved-1FAFF>
|
||||
1FC00..1FFFD ; Extended_Pictographic# E0.0[1022] (..) <reserved-1FC00>..<reserved-1FFFD>
|
||||
|
||||
# Total elements: 3537
|
||||
|
|
540
maint/ucptest.c
540
maint/ucptest.c
|
@ -2,7 +2,7 @@
|
|||
* A program for testing the Unicode property table *
|
||||
***************************************************/
|
||||
|
||||
/* Copyright (c) University of Cambridge 2008-2020 */
|
||||
/* Copyright (c) University of Cambridge 2008-2022 */
|
||||
|
||||
/* Compile thus:
|
||||
|
||||
|
@ -14,40 +14,50 @@
|
|||
*/
|
||||
|
||||
/* This is a hacked-up program for testing the Unicode properties tables of
|
||||
PCRE2. It can also be used for finding characters with certain properties.
|
||||
I wrote it to help with debugging PCRE, and have added things that I found
|
||||
useful, in a rather haphazard way. The code has never been seriously tidied or
|
||||
checked for robustness, but it shouldn't now give compiler warnings.
|
||||
PCRE2. It can also be used for finding characters with certain properties. I
|
||||
wrote it to help with debugging, and have added things that I found useful, in
|
||||
a rather haphazard way. The code has never been seriously tidied or checked for
|
||||
robustness, but it shouldn't now give compiler warnings.
|
||||
|
||||
There is only one option: "-s". If given, it applies only to the "findprop"
|
||||
command. It causes the UTF-8 sequence of bytes that encode the character to be
|
||||
output between angle brackets at the end of the line. On a UTF-8 terminal, this
|
||||
There is only one option: "-s". If given, it applies only to the "findprop"
|
||||
command. It causes the UTF-8 sequence of bytes that encode the character to be
|
||||
output between angle brackets at the end of the line. On a UTF-8 terminal, this
|
||||
will show the appropriate graphic for the code point.
|
||||
|
||||
If the command has arguments, they are concatenated into a buffer, separated by
|
||||
spaces. If the first argument starts "U+" or consists entirely of hexadecimal
|
||||
digits, "findprop" is inserted at the start. The buffer is then processed as a
|
||||
single line file, after which the program exits. If there are no arguments, the
|
||||
program reads commands line by line on stdin and writes output to stdout. The
|
||||
program reads commands line by line on stdin and writes output to stdout. The
|
||||
return code is always zero.
|
||||
|
||||
There are three commands:
|
||||
|
||||
"findprop" must be followed by a space-separated list of Unicode code points as
|
||||
hex numbers, either without any prefix or starting with "U+". The output is one
|
||||
line per character, giving its Unicode properties followed by its other case or
|
||||
cases if one or more exist, followed by its Script Extension list if it is not
|
||||
just the same as the base script. This list is in square brackets. The
|
||||
properties are:
|
||||
The command "findprop" must be followed by a space-separated list of Unicode
|
||||
code points as hex numbers, either without any prefix or starting with "U+", or
|
||||
as individual UTF-8 characters preceded by '+'. For example:
|
||||
|
||||
General type e.g. Letter
|
||||
Specific type e.g. Upper case letter
|
||||
Script e.g. Medefaidrin
|
||||
Grapheme break type e.g. Extend (most common is Other)
|
||||
findprop U+1234 5Abc +?
|
||||
|
||||
"find" must be followed by a list of property names and their values. The
|
||||
values are case-sensitive. This finds characters that have those properties. If
|
||||
multiple properties are listed, they must all be matched. Currently supported:
|
||||
The output is one long line per character, listing Unicode properties that have
|
||||
values, followed by its other case or cases if one or more exist, followed by
|
||||
its Script Extension list if there is one. This list is in square brackets. A
|
||||
second list in square brackets gives all the Boolean properties of the
|
||||
character. The properties that come first are:
|
||||
|
||||
Bidi class e.g. NSM (most common is L)
|
||||
General type e.g. Letter
|
||||
Specific type e.g. Upper case letter
|
||||
Script e.g. Medefaidrin
|
||||
Grapheme break type e.g. Extend (most common is Other)
|
||||
|
||||
Script names and Boolean property names are all in lower case, with underscores
|
||||
and hyphens removed, because that's how they are stored for "loose" matching.
|
||||
|
||||
The command "find" must be followed by a list of property types and their
|
||||
values. The values are case-sensitive, except for bidi class. This finds
|
||||
characters that have those properties. If multiple properties are listed, they
|
||||
must all be matched. Currently supported:
|
||||
|
||||
script <name> The character must have this script property. Only one
|
||||
such script may be given.
|
||||
|
@ -56,17 +66,20 @@ multiple properties are listed, they must all be matched. Currently supported:
|
|||
scripts must be present.
|
||||
type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
|
||||
gbreak <name> The grapheme break property must match.
|
||||
bidi <class> The character's bidi class must match.
|
||||
bool <name> The character's Boolean property list must contain this
|
||||
property.
|
||||
|
||||
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
||||
Script Extensions, there may be a mixture of positive and negative
|
||||
requirements. All must be satisfied.
|
||||
Script Extensions and Boolean properties, there may be a mixture of positive
|
||||
and negative requirements. All must be satisfied.
|
||||
|
||||
Sequences of two or more characters are shown as ranges, for example
|
||||
U+0041..U+004A. No more than 100 lines are output. If there are more
|
||||
characters, the list ends with ...
|
||||
characters, the list ends with ...
|
||||
|
||||
"list" must be followed by a property name (script, type, or gbreak). The
|
||||
defined values for that property are listed. */
|
||||
The command "list" must be followed by one of the property names script, bool,
|
||||
type, gbreak or bidi. The defined values for that property are listed. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
|
@ -97,6 +110,9 @@ defined values for that property are listed. */
|
|||
#include <editline/readline.h>
|
||||
#else
|
||||
#include <readline/readline.h>
|
||||
#ifdef RL_VERSION_MAJOR
|
||||
#include <readline/history.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
@ -145,7 +161,7 @@ static const unsigned char *type_names[] = {
|
|||
US"So", US"Other symbol",
|
||||
US"Zl", US"Line separator",
|
||||
US"Zp", US"Paragraph separator",
|
||||
US"Zs", US"Space separator"
|
||||
US"Zs", US"Space separator"
|
||||
};
|
||||
|
||||
static const unsigned char *gb_names[] = {
|
||||
|
@ -160,12 +176,37 @@ static const unsigned char *gb_names[] = {
|
|||
US"T", US"Hangul syllable type T",
|
||||
US"LV", US"Hangul syllable type LV",
|
||||
US"LVT", US"Hangul syllable type LVT",
|
||||
US"RegionalIndicator", US"",
|
||||
US"Regional_Indicator", US"",
|
||||
US"Other", US"",
|
||||
US"ZWJ", US"zero width joiner",
|
||||
US"Extended_Pictographic", US""
|
||||
};
|
||||
|
||||
static const unsigned char *bd_names[] = {
|
||||
US"AL", US"Arabic letter",
|
||||
US"AN", US"Arabid number",
|
||||
US"B", US"Paragraph separator",
|
||||
US"BN", US"Boundary neutral",
|
||||
US"CS", US"Common separator",
|
||||
US"EN", US"European number",
|
||||
US"ES", US"European separator",
|
||||
US"ET", US"European terminator",
|
||||
US"FSI", US"First string isolate",
|
||||
US"L", US"Left-to-right",
|
||||
US"LRE", US"Left-to-right embedding",
|
||||
US"LRI", US"Left-to-right isolate",
|
||||
US"LRO", US"Left-to-right override",
|
||||
US"NSM", US"Non-spacing mark",
|
||||
US"ON", US"Other neutral",
|
||||
US"PDF", US"Pop directional format",
|
||||
US"PDI", US"Pop directional isolate",
|
||||
US"R", US"Right-to-left",
|
||||
US"RLE", US"Right-to-left embedding",
|
||||
US"RLI", US"Right-to-left isolate",
|
||||
US"RLO", US"Right-to-left override",
|
||||
US"S", US"Segment separator",
|
||||
US"WS", US"White space"
|
||||
};
|
||||
|
||||
static const unsigned int utf8_table1[] = {
|
||||
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
||||
|
@ -173,6 +214,41 @@ static const unsigned int utf8_table1[] = {
|
|||
static const int utf8_table2[] = {
|
||||
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
|
||||
/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
|
||||
the pointer. */
|
||||
|
||||
#define GETUTF8INC(c, eptr) \
|
||||
{ \
|
||||
if ((c & 0x20u) == 0) \
|
||||
c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
|
||||
else if ((c & 0x10u) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
|
||||
eptr += 2; \
|
||||
} \
|
||||
else if ((c & 0x08u) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
|
||||
((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
|
||||
eptr += 3; \
|
||||
} \
|
||||
else if ((c & 0x04u) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
|
||||
((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
|
||||
(eptr[3] & 0x3fu); \
|
||||
eptr += 4; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
|
||||
((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
|
||||
((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
|
||||
eptr += 5; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert character value to UTF-8 *
|
||||
|
@ -224,25 +300,54 @@ return isatty(fileno(stdin));
|
|||
|
||||
|
||||
/*************************************************
|
||||
* Get script name from ucp ident *
|
||||
* Get name from ucp ident *
|
||||
*************************************************/
|
||||
|
||||
static const char *
|
||||
get_scriptname(int script)
|
||||
{
|
||||
size_t i;
|
||||
const ucp_type_table *u;
|
||||
/* The utt table contains both full names and abbreviations. So search for both
|
||||
and use the longer if two are found, unless the first one is only 3 characters
|
||||
and we are looking for a script (some scripts have 3-character names). If this
|
||||
were not just a test program it might be worth making some kind of reverse
|
||||
index. */
|
||||
|
||||
static const char *
|
||||
get_propname(int prop, int type)
|
||||
{
|
||||
size_t i, j, len;
|
||||
size_t foundlist[2];
|
||||
const char *yield;
|
||||
int typex = (type == PT_SC)? PT_SCX : type;
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
u = PRIV(utt) + i;
|
||||
if (u->type == PT_SC && u->value == script) break;
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if ((u->type == type || u->type == typex) && u->value == prop)
|
||||
{
|
||||
foundlist[j++] = i;
|
||||
if (j >= 2) break;
|
||||
}
|
||||
}
|
||||
if (i < PRIV(utt_size))
|
||||
return PRIV(utt_names) + u->name_offset;
|
||||
|
||||
return "??";
|
||||
}
|
||||
if (j == 0) return "??";
|
||||
|
||||
yield = NULL;
|
||||
len = 0;
|
||||
|
||||
for (i = 0; i < j; i++)
|
||||
{
|
||||
const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
|
||||
size_t sl = strlen(s);
|
||||
|
||||
if (sl > len)
|
||||
{
|
||||
yield = s;
|
||||
if (sl == 3 && type == PT_SC) break;
|
||||
len = sl;
|
||||
}
|
||||
}
|
||||
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
|
@ -257,13 +362,16 @@ int fulltype = UCD_CHARTYPE(c);
|
|||
int script = UCD_SCRIPT(c);
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
int gbprop = UCD_GRAPHBREAK(c);
|
||||
int bidi = UCD_BIDICLASS(c);
|
||||
unsigned int othercase = UCD_OTHERCASE(c);
|
||||
int caseset = UCD_CASESET(c);
|
||||
int bprops = UCD_BPROPS(c);
|
||||
|
||||
const unsigned char *fulltypename = US"??";
|
||||
const unsigned char *typename = US"??";
|
||||
const unsigned char *graphbreak = US"??";
|
||||
const unsigned char *scriptname = CUS get_scriptname(script);
|
||||
const unsigned char *bidiclass = US"??";
|
||||
const unsigned char *scriptname = CUS get_propname(script, PT_SC);
|
||||
|
||||
switch (type)
|
||||
{
|
||||
|
@ -323,7 +431,7 @@ switch(gbprop)
|
|||
case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
|
||||
case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
|
||||
case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
|
||||
case ucp_gbRegionalIndicator:
|
||||
case ucp_gbRegional_Indicator:
|
||||
graphbreak = US"Regional Indicator"; break;
|
||||
case ucp_gbOther: graphbreak = US"Other"; break;
|
||||
case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
|
||||
|
@ -332,7 +440,37 @@ switch(gbprop)
|
|||
default: graphbreak = US"Unknown"; break;
|
||||
}
|
||||
|
||||
printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
||||
switch(bidi)
|
||||
{
|
||||
case ucp_bidiAL: bidiclass = US"AL "; break;
|
||||
case ucp_bidiFSI: bidiclass = US"FSI"; break;
|
||||
case ucp_bidiL: bidiclass = US"L "; break;
|
||||
case ucp_bidiLRE: bidiclass = US"LRE"; break;
|
||||
case ucp_bidiLRI: bidiclass = US"LRI"; break;
|
||||
case ucp_bidiLRO: bidiclass = US"LRO"; break;
|
||||
case ucp_bidiPDF: bidiclass = US"PDF"; break;
|
||||
case ucp_bidiPDI: bidiclass = US"PDI"; break;
|
||||
case ucp_bidiR: bidiclass = US"R "; break;
|
||||
case ucp_bidiRLE: bidiclass = US"RLE"; break;
|
||||
case ucp_bidiRLI: bidiclass = US"RLI"; break;
|
||||
case ucp_bidiRLO: bidiclass = US"RLO"; break;
|
||||
case ucp_bidiAN: bidiclass = US"AN "; break;
|
||||
case ucp_bidiB: bidiclass = US"B "; break;
|
||||
case ucp_bidiBN: bidiclass = US"BN "; break;
|
||||
case ucp_bidiCS: bidiclass = US"CS "; break;
|
||||
case ucp_bidiEN: bidiclass = US"EN "; break;
|
||||
case ucp_bidiES: bidiclass = US"ES "; break;
|
||||
case ucp_bidiET: bidiclass = US"ET "; break;
|
||||
case ucp_bidiNSM: bidiclass = US"NSM"; break;
|
||||
case ucp_bidiON: bidiclass = US"ON "; break;
|
||||
case ucp_bidiS: bidiclass = US"S "; break;
|
||||
case ucp_bidiWS: bidiclass = US"WS "; break;
|
||||
default: bidiclass = US"???"; break;
|
||||
}
|
||||
|
||||
printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
|
||||
scriptname, graphbreak);
|
||||
|
||||
if (is_just_one && othercase != c)
|
||||
{
|
||||
printf(", U+%04X", othercase);
|
||||
|
@ -341,36 +479,47 @@ if (is_just_one && othercase != c)
|
|||
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
|
||||
while (*(++p) < NOTACHAR)
|
||||
{
|
||||
unsigned int d = *p;
|
||||
unsigned int d = *p;
|
||||
if (d != othercase && d != c) printf(", U+%04X", d);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scriptx != script)
|
||||
if (scriptx != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
|
||||
printf(", [");
|
||||
if (scriptx >= 0)
|
||||
printf("%s", get_scriptname(scriptx));
|
||||
else
|
||||
for (int i = 0; i < ucp_Unknown; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||
while (*p != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_scriptname(*p++));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("%s%s", sep, get_propname(i, PT_SC));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
|
||||
if (bprops != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint32_t *p = PRIV(ucd_boolprop_sets) +
|
||||
bprops * ucd_boolprop_sets_item_size;
|
||||
printf(", [");
|
||||
for (int i = 0; i < ucp_Bprop_Count; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_propname(i, PT_BOOL));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
if (show_character && is_just_one)
|
||||
{
|
||||
unsigned char buffer[8];
|
||||
size_t len = ord2utf8(c, buffer);
|
||||
printf(", >%.*s<", (int)len, buffer);
|
||||
}
|
||||
printf(", >%.*s<", (int)len, buffer);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
@ -384,19 +533,23 @@ printf("\n");
|
|||
static void
|
||||
find_chars(unsigned char *s)
|
||||
{
|
||||
unsigned char name[24];
|
||||
unsigned char value[24];
|
||||
unsigned char name[128];
|
||||
unsigned char value[128];
|
||||
unsigned char *t;
|
||||
unsigned int count= 0;
|
||||
int scriptx_list[24];
|
||||
int scriptx_list[128];
|
||||
unsigned int scriptx_count = 0;
|
||||
int bprop_list[128];
|
||||
unsigned int bprop_count = 0;
|
||||
uint32_t i, c;
|
||||
int script = -1;
|
||||
int type = -1;
|
||||
int gbreak = -1;
|
||||
int bidiclass = -1;
|
||||
BOOL script_not = FALSE;
|
||||
BOOL type_not = FALSE;
|
||||
BOOL gbreak_not = FALSE;
|
||||
BOOL bidiclass_not = FALSE;
|
||||
BOOL hadrange = FALSE;
|
||||
const ucd_record *ucd, *next_ucd;
|
||||
const char *pad = " ";
|
||||
|
@ -410,13 +563,18 @@ while (*s != 0)
|
|||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
for (t = value; *s != 0 && !isspace(*s); s++)
|
||||
{
|
||||
if (*s != '_' && *s != '-') *t++ = *s;
|
||||
}
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "script") == 0 ||
|
||||
strcmp(CS name, "scriptx") == 0)
|
||||
{
|
||||
for (t = value; *t != 0; t++) *t = tolower(*t);
|
||||
|
||||
if (value[0] == '!')
|
||||
{
|
||||
if (name[6] == 'x') scriptx_not = TRUE;
|
||||
|
@ -426,11 +584,11 @@ while (*s != 0)
|
|||
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_SC && strcmp(CS(value + offset),
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
|
||||
PRIV(utt_names) + u->name_offset) == 0)
|
||||
{
|
||||
c = u->value;
|
||||
c = u->value;
|
||||
if (name[6] == 'x')
|
||||
{
|
||||
scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
|
||||
|
@ -454,6 +612,33 @@ while (*s != 0)
|
|||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bool") == 0)
|
||||
{
|
||||
int not = 1;
|
||||
if (value[0] == '!')
|
||||
{
|
||||
not = -1;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_BOOL && strcmp(CS(value + offset),
|
||||
PRIV(utt_names) + u->name_offset) == 0)
|
||||
{
|
||||
bprop_list[bprop_count++] = u->value * not;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i >= PRIV(utt_size))
|
||||
{
|
||||
printf("** Unrecognized property name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "type") == 0)
|
||||
{
|
||||
if (type >= 0)
|
||||
|
@ -516,6 +701,38 @@ while (*s != 0)
|
|||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bidi") == 0 ||
|
||||
strcmp(CS name, "bidiclass") == 0 ||
|
||||
strcmp(CS name, "bidi_class") == 0 )
|
||||
{
|
||||
if (bidiclass >= 0)
|
||||
{
|
||||
printf("** Only 1 bidi class value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
bidiclass_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
|
||||
{
|
||||
if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
|
||||
{
|
||||
bidiclass = i/2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(bd_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized bidi class name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unrecognized property name \"%s\"\n", name);
|
||||
|
@ -523,7 +740,8 @@ while (*s != 0)
|
|||
}
|
||||
}
|
||||
|
||||
if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
|
||||
if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
|
||||
gbreak < 0 && bidiclass < 0)
|
||||
{
|
||||
printf("** No properties specified\n");
|
||||
return;
|
||||
|
@ -535,55 +753,55 @@ for (c = 0; c <= 0x10ffff; c++)
|
|||
|
||||
if (scriptx_count > 0)
|
||||
{
|
||||
const uint8_t *char_scriptx = NULL;
|
||||
const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
|
||||
unsigned int found = 0;
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
|
||||
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
|
||||
|
||||
for (i = 0; i < scriptx_count; i++)
|
||||
{
|
||||
int x = scriptx_list[i]/32;
|
||||
int y = scriptx_list[i]%32;
|
||||
|
||||
/* Positive requirement */
|
||||
if (scriptx_list[i] >= 0)
|
||||
{
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
if (scriptx == scriptx_list[i]) found++;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
const uint8_t *p;
|
||||
for (p = char_scriptx; *p != 0; p++)
|
||||
{
|
||||
if (scriptx_list[i] == *p)
|
||||
{
|
||||
found++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if (scriptx >= 0)
|
||||
{
|
||||
if (scriptx != -scriptx_list[i]) found++;
|
||||
}
|
||||
else
|
||||
{
|
||||
const uint8_t *p;
|
||||
for (p = char_scriptx; *p != 0; p++)
|
||||
if (-scriptx_list[i] == *p) break;
|
||||
if (*p == 0) found++;
|
||||
}
|
||||
if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (found != scriptx_count) continue;
|
||||
}
|
||||
|
||||
if (bprop_count > 0)
|
||||
{
|
||||
const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
|
||||
unsigned int found = 0;
|
||||
|
||||
for (i = 0; i < bprop_count; i++)
|
||||
{
|
||||
int x = bprop_list[i]/32;
|
||||
int y = bprop_list[i]%32;
|
||||
|
||||
/* Positive requirement */
|
||||
if (bprop_list[i] >= 0)
|
||||
{
|
||||
if ((bits_bprop[x] & (1u<<y)) != 0) found++;
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (found != bprop_count) continue;
|
||||
}
|
||||
|
||||
if (type >= 0)
|
||||
{
|
||||
if (type_not)
|
||||
|
@ -608,6 +826,18 @@ for (c = 0; c <= 0x10ffff; c++)
|
|||
}
|
||||
}
|
||||
|
||||
if (bidiclass >= 0)
|
||||
{
|
||||
if (bidiclass_not)
|
||||
{
|
||||
if (bidiclass == UCD_BIDICLASS(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (bidiclass != UCD_BIDICLASS(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* All conditions are met. Look for runs. */
|
||||
|
||||
ucd = GET_UCD(c);
|
||||
|
@ -663,23 +893,37 @@ if (strcmp(CS name, "findprop") == 0)
|
|||
{
|
||||
while (*s != 0)
|
||||
{
|
||||
unsigned int c;
|
||||
unsigned int c;
|
||||
unsigned char *endptr;
|
||||
t = s;
|
||||
if (strncmp(CS t, "U+", 2) == 0) t += 2;
|
||||
c = strtoul(CS t, CSS(&endptr), 16);
|
||||
t = s;
|
||||
|
||||
if (*t == '+')
|
||||
{
|
||||
c = *(++t);
|
||||
if (c > 0x7fu)
|
||||
{
|
||||
GETCHARINC(c, t);
|
||||
}
|
||||
endptr = t+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (strncmp(CS t, "U+", 2) == 0) t += 2;
|
||||
c = strtoul(CS t, CSS(&endptr), 16);
|
||||
}
|
||||
|
||||
if (*endptr != 0 && !isspace(*endptr))
|
||||
{
|
||||
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
||||
printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s);
|
||||
printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
if (c > 0x10ffff)
|
||||
if (c > 0x10ffff)
|
||||
printf("** U+%x is too big for a Unicode code point\n", c);
|
||||
else
|
||||
else
|
||||
print_prop(c, TRUE);
|
||||
}
|
||||
}
|
||||
s = endptr;
|
||||
while (isspace(*s)) s++;
|
||||
}
|
||||
|
@ -689,7 +933,7 @@ else if (strcmp(CS name, "find") == 0)
|
|||
{
|
||||
find_chars(s);
|
||||
}
|
||||
|
||||
|
||||
else if (strcmp(CS name, "list") == 0)
|
||||
{
|
||||
while (*s != 0)
|
||||
|
@ -698,38 +942,52 @@ else if (strcmp(CS name, "list") == 0)
|
|||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
|
||||
if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
|
||||
{
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_SC)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
}
|
||||
|
||||
|
||||
else if (strcmp(CS name, "bool") == 0)
|
||||
{
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_BOOL)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
|
||||
{
|
||||
for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
|
||||
printf("%s %s\n", type_names[i], type_names[i+1]);
|
||||
}
|
||||
|
||||
printf("%s %s\n", type_names[i], type_names[i+1]);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
|
||||
{
|
||||
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
|
||||
{
|
||||
if (gb_names[i+1][0] != 0)
|
||||
if (gb_names[i+1][0] != 0)
|
||||
printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
|
||||
else
|
||||
else
|
||||
printf("%s\n", gb_names[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
else if (strcmp(CS name, "bidi") == 0 ||
|
||||
strcmp(CS name, "bidiclasses") == 0)
|
||||
{
|
||||
printf("** Unknown property \"%s\"\n", name);
|
||||
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
|
||||
printf("%3s %s\n", bd_names[i], bd_names[i+1]);
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unknown property \"%s\"\n", name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else printf("** Unknown test command \"%s\"\n", name);
|
||||
}
|
||||
|
@ -751,32 +1009,32 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0)
|
|||
{
|
||||
show_character = TRUE;
|
||||
first_arg++;
|
||||
}
|
||||
}
|
||||
|
||||
if (argc > first_arg)
|
||||
{
|
||||
int i;
|
||||
BOOL hexfirst = TRUE;
|
||||
char *arg = argv[first_arg];
|
||||
BOOL datafirst = TRUE;
|
||||
char *arg = argv[first_arg];
|
||||
unsigned char *s = buffer;
|
||||
|
||||
if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
|
||||
|
||||
if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
|
||||
{
|
||||
while (*arg != 0)
|
||||
while (*arg != 0)
|
||||
{
|
||||
if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }
|
||||
}
|
||||
}
|
||||
|
||||
if (hexfirst)
|
||||
if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
|
||||
}
|
||||
}
|
||||
|
||||
if (datafirst)
|
||||
{
|
||||
strcpy(CS s, "findprop ");
|
||||
s += 9;
|
||||
}
|
||||
|
||||
|
||||
for (i = first_arg; i < argc; i++)
|
||||
{
|
||||
s += sprintf(CS s, "%s ", argv[i]);
|
||||
s += sprintf(CS s, "%s ", argv[i]);
|
||||
}
|
||||
|
||||
process_command_line(buffer);
|
||||
|
@ -812,7 +1070,7 @@ for(;;)
|
|||
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
|
||||
if (!interactive) printf("%s", buffer);
|
||||
}
|
||||
|
||||
|
||||
process_command_line(buffer);
|
||||
}
|
||||
|
||||
|
|
|
@ -46,3 +46,5 @@ findprop 32ff
|
|||
findprop 1f16d
|
||||
|
||||
findprop U+10e93 U+10eaa
|
||||
|
||||
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
|
||||
|
|
|
@ -3,3 +3,17 @@ find type Pe script Common scriptx Hangul
|
|||
find type Sk
|
||||
find type Pd
|
||||
find gbreak LVT
|
||||
find script Old_Uyghur
|
||||
find bidi PDF
|
||||
find bidi CS
|
||||
find bidi CS type Sm
|
||||
find bidi B
|
||||
find bidi FSI
|
||||
find bidi PDI
|
||||
find bidi RLI
|
||||
find bidi RLO
|
||||
find bidi S
|
||||
find bidi WS
|
||||
find script bopo
|
||||
find bool prependedconcatenationmark
|
||||
find bool pcm
|
||||
|
|
|
@ -1,398 +1,409 @@
|
|||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||
U+0000 Control: Control, Common, Control
|
||||
U+0001 Control: Control, Common, Control
|
||||
U+0002 Control: Control, Common, Control
|
||||
U+0003 Control: Control, Common, Control
|
||||
U+0004 Control: Control, Common, Control
|
||||
U+0005 Control: Control, Common, Control
|
||||
U+0006 Control: Control, Common, Control
|
||||
U+0007 Control: Control, Common, Control
|
||||
U+0008 Control: Control, Common, Control
|
||||
U+0009 Control: Control, Common, Control
|
||||
U+000A Control: Control, Common, LF
|
||||
U+000B Control: Control, Common, Control
|
||||
U+000C Control: Control, Common, Control
|
||||
U+000D Control: Control, Common, CR
|
||||
U+000E Control: Control, Common, Control
|
||||
U+000F Control: Control, Common, Control
|
||||
U+0000 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0001 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0002 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0003 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0004 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0005 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0006 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0007 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0008 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+000F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||
U+0010 Control: Control, Common, Control
|
||||
U+0011 Control: Control, Common, Control
|
||||
U+0012 Control: Control, Common, Control
|
||||
U+0013 Control: Control, Common, Control
|
||||
U+0014 Control: Control, Common, Control
|
||||
U+0015 Control: Control, Common, Control
|
||||
U+0016 Control: Control, Common, Control
|
||||
U+0017 Control: Control, Common, Control
|
||||
U+0018 Control: Control, Common, Control
|
||||
U+0019 Control: Control, Common, Control
|
||||
U+001A Control: Control, Common, Control
|
||||
U+001B Control: Control, Common, Control
|
||||
U+001C Control: Control, Common, Control
|
||||
U+001D Control: Control, Common, Control
|
||||
U+001E Control: Control, Common, Control
|
||||
U+001F Control: Control, Common, Control
|
||||
U+0010 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0011 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0012 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0013 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0014 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0015 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0016 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0017 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0018 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0019 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001A BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001B BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001C B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001D B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||
U+0020 Separator: Space separator, Common, Other
|
||||
U+0021 Punctuation: Other punctuation, Common, Other
|
||||
U+0022 Punctuation: Other punctuation, Common, Other
|
||||
U+0023 Punctuation: Other punctuation, Common, Other
|
||||
U+0024 Symbol: Currency symbol, Common, Other
|
||||
U+0025 Punctuation: Other punctuation, Common, Other
|
||||
U+0026 Punctuation: Other punctuation, Common, Other
|
||||
U+0027 Punctuation: Other punctuation, Common, Other
|
||||
U+0028 Punctuation: Open punctuation, Common, Other
|
||||
U+0029 Punctuation: Close punctuation, Common, Other
|
||||
U+002A Punctuation: Other punctuation, Common, Other
|
||||
U+002B Symbol: Mathematical symbol, Common, Other
|
||||
U+002C Punctuation: Other punctuation, Common, Other
|
||||
U+002D Punctuation: Dash punctuation, Common, Other
|
||||
U+002E Punctuation: Other punctuation, Common, Other
|
||||
U+002F Punctuation: Other punctuation, Common, Other
|
||||
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
|
||||
U+0021 ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
U+0022 ON Punctuation: Other punctuation, common, Other, [ascii, graphemebase, math, patternsyntax]
|
||||
U+0023 ET Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
|
||||
U+0024 ET Symbol: Currency symbol, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0025 ET Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0026 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0027 ON Punctuation: Other punctuation, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
|
||||
U+0028 ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+0029 ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+002A ON Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
|
||||
U+002B ES Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
|
||||
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||
U+0030 Number: Decimal number, Common, Other
|
||||
U+0031 Number: Decimal number, Common, Other
|
||||
U+0032 Number: Decimal number, Common, Other
|
||||
U+0033 Number: Decimal number, Common, Other
|
||||
U+0034 Number: Decimal number, Common, Other
|
||||
U+0035 Number: Decimal number, Common, Other
|
||||
U+0036 Number: Decimal number, Common, Other
|
||||
U+0037 Number: Decimal number, Common, Other
|
||||
U+0038 Number: Decimal number, Common, Other
|
||||
U+0039 Number: Decimal number, Common, Other
|
||||
U+003A Punctuation: Other punctuation, Common, Other
|
||||
U+003B Punctuation: Other punctuation, Common, Other
|
||||
U+003C Symbol: Mathematical symbol, Common, Other
|
||||
U+003D Symbol: Mathematical symbol, Common, Other
|
||||
U+003E Symbol: Mathematical symbol, Common, Other
|
||||
U+003F Punctuation: Other punctuation, Common, Other
|
||||
U+0030 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0031 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0032 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0033 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0034 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0035 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0036 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0037 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0038 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0039 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+003B ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+003C ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
|
||||
U+003D ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+003E ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
|
||||
U+003F ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||
U+0040 Punctuation: Other punctuation, Common, Other
|
||||
U+0041 Letter: Upper case letter, Latin, Other, U+0061
|
||||
U+0042 Letter: Upper case letter, Latin, Other, U+0062
|
||||
U+0043 Letter: Upper case letter, Latin, Other, U+0063
|
||||
U+0044 Letter: Upper case letter, Latin, Other, U+0064
|
||||
U+0045 Letter: Upper case letter, Latin, Other, U+0065
|
||||
U+0046 Letter: Upper case letter, Latin, Other, U+0066
|
||||
U+0047 Letter: Upper case letter, Latin, Other, U+0067
|
||||
U+0048 Letter: Upper case letter, Latin, Other, U+0068
|
||||
U+0049 Letter: Upper case letter, Latin, Other, U+0069
|
||||
U+004A Letter: Upper case letter, Latin, Other, U+006A
|
||||
U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A
|
||||
U+004C Letter: Upper case letter, Latin, Other, U+006C
|
||||
U+004D Letter: Upper case letter, Latin, Other, U+006D
|
||||
U+004E Letter: Upper case letter, Latin, Other, U+006E
|
||||
U+004F Letter: Upper case letter, Latin, Other, U+006F
|
||||
U+0040 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0041 L Letter: Upper case letter, latin, Other, U+0061, [graphemebase]
|
||||
U+0042 L Letter: Upper case letter, latin, Other, U+0062, [graphemebase]
|
||||
U+0043 L Letter: Upper case letter, latin, Other, U+0063, [graphemebase]
|
||||
U+0044 L Letter: Upper case letter, latin, Other, U+0064, [graphemebase]
|
||||
U+0045 L Letter: Upper case letter, latin, Other, U+0065, [graphemebase]
|
||||
U+0046 L Letter: Upper case letter, latin, Other, U+0066, [graphemebase]
|
||||
U+0047 L Letter: Upper case letter, latin, Other, U+0067, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0048 L Letter: Upper case letter, latin, Other, U+0068, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0049 L Letter: Upper case letter, latin, Other, U+0069, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004A L Letter: Upper case letter, latin, Other, U+006A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004B L Letter: Upper case letter, latin, Other, U+006B, U+212A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004C L Letter: Upper case letter, latin, Other, U+006C, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004D L Letter: Upper case letter, latin, Other, U+006D, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004E L Letter: Upper case letter, latin, Other, U+006E, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004F L Letter: Upper case letter, latin, Other, U+006F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||
U+0050 Letter: Upper case letter, Latin, Other, U+0070
|
||||
U+0051 Letter: Upper case letter, Latin, Other, U+0071
|
||||
U+0052 Letter: Upper case letter, Latin, Other, U+0072
|
||||
U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F
|
||||
U+0054 Letter: Upper case letter, Latin, Other, U+0074
|
||||
U+0055 Letter: Upper case letter, Latin, Other, U+0075
|
||||
U+0056 Letter: Upper case letter, Latin, Other, U+0076
|
||||
U+0057 Letter: Upper case letter, Latin, Other, U+0077
|
||||
U+0058 Letter: Upper case letter, Latin, Other, U+0078
|
||||
U+0059 Letter: Upper case letter, Latin, Other, U+0079
|
||||
U+005A Letter: Upper case letter, Latin, Other, U+007A
|
||||
U+005B Punctuation: Open punctuation, Common, Other
|
||||
U+005C Punctuation: Other punctuation, Common, Other
|
||||
U+005D Punctuation: Close punctuation, Common, Other
|
||||
U+005E Symbol: Modifier symbol, Common, Other
|
||||
U+005F Punctuation: Connector punctuation, Common, Other
|
||||
U+0050 L Letter: Upper case letter, latin, Other, U+0070, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0051 L Letter: Upper case letter, latin, Other, U+0071, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0052 L Letter: Upper case letter, latin, Other, U+0072, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0053 L Letter: Upper case letter, latin, Other, U+0073, U+017F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0054 L Letter: Upper case letter, latin, Other, U+0074, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0055 L Letter: Upper case letter, latin, Other, U+0075, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0056 L Letter: Upper case letter, latin, Other, U+0076, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0057 L Letter: Upper case letter, latin, Other, U+0077, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0058 L Letter: Upper case letter, latin, Other, U+0078, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0059 L Letter: Upper case letter, latin, Other, U+0079, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+005A L Letter: Upper case letter, latin, Other, U+007A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+005B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+005C ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+005D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+005F ON Punctuation: Connector punctuation, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, deprecated, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||
U+0060 Symbol: Modifier symbol, Common, Other
|
||||
U+0061 Letter: Lower case letter, Latin, Other, U+0041
|
||||
U+0062 Letter: Lower case letter, Latin, Other, U+0042
|
||||
U+0063 Letter: Lower case letter, Latin, Other, U+0043
|
||||
U+0064 Letter: Lower case letter, Latin, Other, U+0044
|
||||
U+0065 Letter: Lower case letter, Latin, Other, U+0045
|
||||
U+0066 Letter: Lower case letter, Latin, Other, U+0046
|
||||
U+0067 Letter: Lower case letter, Latin, Other, U+0047
|
||||
U+0068 Letter: Lower case letter, Latin, Other, U+0048
|
||||
U+0069 Letter: Lower case letter, Latin, Other, U+0049
|
||||
U+006A Letter: Lower case letter, Latin, Other, U+004A
|
||||
U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A
|
||||
U+006C Letter: Lower case letter, Latin, Other, U+004C
|
||||
U+006D Letter: Lower case letter, Latin, Other, U+004D
|
||||
U+006E Letter: Lower case letter, Latin, Other, U+004E
|
||||
U+006F Letter: Lower case letter, Latin, Other, U+004F
|
||||
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+0061 L Letter: Lower case letter, latin, Other, U+0041, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0062 L Letter: Lower case letter, latin, Other, U+0042, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0063 L Letter: Lower case letter, latin, Other, U+0043, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0064 L Letter: Lower case letter, latin, Other, U+0044, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0065 L Letter: Lower case letter, latin, Other, U+0045, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0066 L Letter: Lower case letter, latin, Other, U+0046, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0067 L Letter: Lower case letter, latin, Other, U+0047, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0068 L Letter: Lower case letter, latin, Other, U+0048, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0069 L Letter: Lower case letter, latin, Other, U+0049, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+006A L Letter: Lower case letter, latin, Other, U+004A, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+006B L Letter: Lower case letter, latin, Other, U+004B, U+212A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006C L Letter: Lower case letter, latin, Other, U+004C, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006D L Letter: Lower case letter, latin, Other, U+004D, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006E L Letter: Lower case letter, latin, Other, U+004E, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006F L Letter: Lower case letter, latin, Other, U+004F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||
U+0070 Letter: Lower case letter, Latin, Other, U+0050
|
||||
U+0071 Letter: Lower case letter, Latin, Other, U+0051
|
||||
U+0072 Letter: Lower case letter, Latin, Other, U+0052
|
||||
U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F
|
||||
U+0074 Letter: Lower case letter, Latin, Other, U+0054
|
||||
U+0075 Letter: Lower case letter, Latin, Other, U+0055
|
||||
U+0076 Letter: Lower case letter, Latin, Other, U+0056
|
||||
U+0077 Letter: Lower case letter, Latin, Other, U+0057
|
||||
U+0078 Letter: Lower case letter, Latin, Other, U+0058
|
||||
U+0079 Letter: Lower case letter, Latin, Other, U+0059
|
||||
U+007A Letter: Lower case letter, Latin, Other, U+005A
|
||||
U+007B Punctuation: Open punctuation, Common, Other
|
||||
U+007C Symbol: Mathematical symbol, Common, Other
|
||||
U+007D Punctuation: Close punctuation, Common, Other
|
||||
U+007E Symbol: Mathematical symbol, Common, Other
|
||||
U+007F Control: Control, Common, Control
|
||||
U+0070 L Letter: Lower case letter, latin, Other, U+0050, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0071 L Letter: Lower case letter, latin, Other, U+0051, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0072 L Letter: Lower case letter, latin, Other, U+0052, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0073 L Letter: Lower case letter, latin, Other, U+0053, U+017F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0074 L Letter: Lower case letter, latin, Other, U+0054, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0075 L Letter: Lower case letter, latin, Other, U+0055, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0076 L Letter: Lower case letter, latin, Other, U+0056, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0077 L Letter: Lower case letter, latin, Other, U+0057, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0078 L Letter: Lower case letter, latin, Other, U+0058, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0079 L Letter: Lower case letter, latin, Other, U+0059, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+007A L Letter: Lower case letter, latin, Other, U+005A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+007B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+007C ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+007D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+007E ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+007F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
|
||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||
U+0080 Control: Control, Common, Control
|
||||
U+0081 Control: Control, Common, Control
|
||||
U+0082 Control: Control, Common, Control
|
||||
U+0083 Control: Control, Common, Control
|
||||
U+0084 Control: Control, Common, Control
|
||||
U+0085 Control: Control, Common, Control
|
||||
U+0086 Control: Control, Common, Control
|
||||
U+0087 Control: Control, Common, Control
|
||||
U+0088 Control: Control, Common, Control
|
||||
U+0089 Control: Control, Common, Control
|
||||
U+008A Control: Control, Common, Control
|
||||
U+008B Control: Control, Common, Control
|
||||
U+008C Control: Control, Common, Control
|
||||
U+008D Control: Control, Common, Control
|
||||
U+008E Control: Control, Common, Control
|
||||
U+008F Control: Control, Common, Control
|
||||
U+0080 BN Control: Control, common, Control
|
||||
U+0081 BN Control: Control, common, Control
|
||||
U+0082 BN Control: Control, common, Control
|
||||
U+0083 BN Control: Control, common, Control
|
||||
U+0084 BN Control: Control, common, Control
|
||||
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0086 BN Control: Control, common, Control
|
||||
U+0087 BN Control: Control, common, Control
|
||||
U+0088 BN Control: Control, common, Control
|
||||
U+0089 BN Control: Control, common, Control
|
||||
U+008A BN Control: Control, common, Control
|
||||
U+008B BN Control: Control, common, Control
|
||||
U+008C BN Control: Control, common, Control
|
||||
U+008D BN Control: Control, common, Control
|
||||
U+008E BN Control: Control, common, Control
|
||||
U+008F BN Control: Control, common, Control
|
||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||
U+0090 Control: Control, Common, Control
|
||||
U+0091 Control: Control, Common, Control
|
||||
U+0092 Control: Control, Common, Control
|
||||
U+0093 Control: Control, Common, Control
|
||||
U+0094 Control: Control, Common, Control
|
||||
U+0095 Control: Control, Common, Control
|
||||
U+0096 Control: Control, Common, Control
|
||||
U+0097 Control: Control, Common, Control
|
||||
U+0098 Control: Control, Common, Control
|
||||
U+0099 Control: Control, Common, Control
|
||||
U+009A Control: Control, Common, Control
|
||||
U+009B Control: Control, Common, Control
|
||||
U+009C Control: Control, Common, Control
|
||||
U+009D Control: Control, Common, Control
|
||||
U+009E Control: Control, Common, Control
|
||||
U+009F Control: Control, Common, Control
|
||||
U+0090 BN Control: Control, common, Control
|
||||
U+0091 BN Control: Control, common, Control
|
||||
U+0092 BN Control: Control, common, Control
|
||||
U+0093 BN Control: Control, common, Control
|
||||
U+0094 BN Control: Control, common, Control
|
||||
U+0095 BN Control: Control, common, Control
|
||||
U+0096 BN Control: Control, common, Control
|
||||
U+0097 BN Control: Control, common, Control
|
||||
U+0098 BN Control: Control, common, Control
|
||||
U+0099 BN Control: Control, common, Control
|
||||
U+009A BN Control: Control, common, Control
|
||||
U+009B BN Control: Control, common, Control
|
||||
U+009C BN Control: Control, common, Control
|
||||
U+009D BN Control: Control, common, Control
|
||||
U+009E BN Control: Control, common, Control
|
||||
U+009F BN Control: Control, common, Control
|
||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||
U+00A0 Separator: Space separator, Common, Other
|
||||
U+00A1 Punctuation: Other punctuation, Common, Other
|
||||
U+00A2 Symbol: Currency symbol, Common, Other
|
||||
U+00A3 Symbol: Currency symbol, Common, Other
|
||||
U+00A4 Symbol: Currency symbol, Common, Other
|
||||
U+00A5 Symbol: Currency symbol, Common, Other
|
||||
U+00A6 Symbol: Other symbol, Common, Other
|
||||
U+00A7 Punctuation: Other punctuation, Common, Other
|
||||
U+00A8 Symbol: Modifier symbol, Common, Other
|
||||
U+00A9 Symbol: Other symbol, Common, Extended Pictographic
|
||||
U+00AA Letter: Other letter, Latin, Other
|
||||
U+00AB Punctuation: Initial punctuation, Common, Other
|
||||
U+00AC Symbol: Mathematical symbol, Common, Other
|
||||
U+00AD Control: Format, Common, Control
|
||||
U+00AE Symbol: Other symbol, Common, Extended Pictographic
|
||||
U+00AF Symbol: Modifier symbol, Common, Other
|
||||
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+00A1 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A2 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A3 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A4 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A5 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A6 ON Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A7 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00A9 ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
|
||||
U+00AB ON Punctuation: Initial punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+00AC ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+00AE ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||
U+00B0 Symbol: Other symbol, Common, Other
|
||||
U+00B1 Symbol: Mathematical symbol, Common, Other
|
||||
U+00B2 Number: Other number, Common, Other
|
||||
U+00B3 Number: Other number, Common, Other
|
||||
U+00B4 Symbol: Modifier symbol, Common, Other
|
||||
U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C
|
||||
U+00B6 Punctuation: Other punctuation, Common, Other
|
||||
U+00B7 Punctuation: Other punctuation, Common, Other
|
||||
U+00B8 Symbol: Modifier symbol, Common, Other
|
||||
U+00B9 Number: Other number, Common, Other
|
||||
U+00BA Letter: Other letter, Latin, Other
|
||||
U+00BB Punctuation: Final punctuation, Common, Other
|
||||
U+00BC Number: Other number, Common, Other
|
||||
U+00BD Number: Other number, Common, Other
|
||||
U+00BE Number: Other number, Common, Other
|
||||
U+00BF Punctuation: Other punctuation, Common, Other
|
||||
U+00B0 ET Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00B1 ET Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00B2 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B3 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B5 L Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B6 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00B7 ON Punctuation: Other punctuation, common, Other, [alphabetic, graphemebase, idcontinue, xidcontinue]
|
||||
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B9 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
|
||||
U+00BB ON Punctuation: Final punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+00BC ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BD ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BE ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BF ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||
U+00C0 Letter: Upper case letter, Latin, Other, U+00E0
|
||||
U+00C1 Letter: Upper case letter, Latin, Other, U+00E1
|
||||
U+00C2 Letter: Upper case letter, Latin, Other, U+00E2
|
||||
U+00C3 Letter: Upper case letter, Latin, Other, U+00E3
|
||||
U+00C4 Letter: Upper case letter, Latin, Other, U+00E4
|
||||
U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B
|
||||
U+00C6 Letter: Upper case letter, Latin, Other, U+00E6
|
||||
U+00C7 Letter: Upper case letter, Latin, Other, U+00E7
|
||||
U+00C8 Letter: Upper case letter, Latin, Other, U+00E8
|
||||
U+00C9 Letter: Upper case letter, Latin, Other, U+00E9
|
||||
U+00CA Letter: Upper case letter, Latin, Other, U+00EA
|
||||
U+00CB Letter: Upper case letter, Latin, Other, U+00EB
|
||||
U+00CC Letter: Upper case letter, Latin, Other, U+00EC
|
||||
U+00CD Letter: Upper case letter, Latin, Other, U+00ED
|
||||
U+00CE Letter: Upper case letter, Latin, Other, U+00EE
|
||||
U+00CF Letter: Upper case letter, Latin, Other, U+00EF
|
||||
U+00C0 L Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C1 L Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C2 L Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C3 L Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C4 L Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C5 L Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C6 L Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C7 L Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C8 L Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C9 L Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CA L Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CB L Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CC L Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CD L Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CE L Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CF L Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||
U+00D0 Letter: Upper case letter, Latin, Other, U+00F0
|
||||
U+00D1 Letter: Upper case letter, Latin, Other, U+00F1
|
||||
U+00D2 Letter: Upper case letter, Latin, Other, U+00F2
|
||||
U+00D3 Letter: Upper case letter, Latin, Other, U+00F3
|
||||
U+00D4 Letter: Upper case letter, Latin, Other, U+00F4
|
||||
U+00D5 Letter: Upper case letter, Latin, Other, U+00F5
|
||||
U+00D6 Letter: Upper case letter, Latin, Other, U+00F6
|
||||
U+00D7 Symbol: Mathematical symbol, Common, Other
|
||||
U+00D8 Letter: Upper case letter, Latin, Other, U+00F8
|
||||
U+00D9 Letter: Upper case letter, Latin, Other, U+00F9
|
||||
U+00DA Letter: Upper case letter, Latin, Other, U+00FA
|
||||
U+00DB Letter: Upper case letter, Latin, Other, U+00FB
|
||||
U+00DC Letter: Upper case letter, Latin, Other, U+00FC
|
||||
U+00DD Letter: Upper case letter, Latin, Other, U+00FD
|
||||
U+00DE Letter: Upper case letter, Latin, Other, U+00FE
|
||||
U+00DF Letter: Lower case letter, Latin, Other, U+1E9E
|
||||
U+00D0 L Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D1 L Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D2 L Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D3 L Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D4 L Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D5 L Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D6 L Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D8 L Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D9 L Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DA L Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DB L Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DC L Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DD L Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DE L Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DF L Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||
U+00E0 Letter: Lower case letter, Latin, Other, U+00C0
|
||||
U+00E1 Letter: Lower case letter, Latin, Other, U+00C1
|
||||
U+00E2 Letter: Lower case letter, Latin, Other, U+00C2
|
||||
U+00E3 Letter: Lower case letter, Latin, Other, U+00C3
|
||||
U+00E4 Letter: Lower case letter, Latin, Other, U+00C4
|
||||
U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B
|
||||
U+00E6 Letter: Lower case letter, Latin, Other, U+00C6
|
||||
U+00E7 Letter: Lower case letter, Latin, Other, U+00C7
|
||||
U+00E8 Letter: Lower case letter, Latin, Other, U+00C8
|
||||
U+00E9 Letter: Lower case letter, Latin, Other, U+00C9
|
||||
U+00EA Letter: Lower case letter, Latin, Other, U+00CA
|
||||
U+00EB Letter: Lower case letter, Latin, Other, U+00CB
|
||||
U+00EC Letter: Lower case letter, Latin, Other, U+00CC
|
||||
U+00ED Letter: Lower case letter, Latin, Other, U+00CD
|
||||
U+00EE Letter: Lower case letter, Latin, Other, U+00CE
|
||||
U+00EF Letter: Lower case letter, Latin, Other, U+00CF
|
||||
U+00E0 L Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E1 L Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E2 L Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E3 L Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E4 L Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E5 L Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E6 L Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E7 L Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E8 L Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E9 L Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EA L Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EB L Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EC L Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00ED L Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EE L Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EF L Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||
U+00F0 Letter: Lower case letter, Latin, Other, U+00D0
|
||||
U+00F1 Letter: Lower case letter, Latin, Other, U+00D1
|
||||
U+00F2 Letter: Lower case letter, Latin, Other, U+00D2
|
||||
U+00F3 Letter: Lower case letter, Latin, Other, U+00D3
|
||||
U+00F4 Letter: Lower case letter, Latin, Other, U+00D4
|
||||
U+00F5 Letter: Lower case letter, Latin, Other, U+00D5
|
||||
U+00F6 Letter: Lower case letter, Latin, Other, U+00D6
|
||||
U+00F7 Symbol: Mathematical symbol, Common, Other
|
||||
U+00F8 Letter: Lower case letter, Latin, Other, U+00D8
|
||||
U+00F9 Letter: Lower case letter, Latin, Other, U+00D9
|
||||
U+00FA Letter: Lower case letter, Latin, Other, U+00DA
|
||||
U+00FB Letter: Lower case letter, Latin, Other, U+00DB
|
||||
U+00FC Letter: Lower case letter, Latin, Other, U+00DC
|
||||
U+00FD Letter: Lower case letter, Latin, Other, U+00DD
|
||||
U+00FE Letter: Lower case letter, Latin, Other, U+00DE
|
||||
U+00FF Letter: Lower case letter, Latin, Other, U+0178
|
||||
U+00F0 L Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F1 L Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F2 L Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F3 L Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F4 L Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F5 L Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F6 L Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00F8 L Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F9 L Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FA L Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FB L Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FC L Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FD L Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FE L Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FF L Letter: Lower case letter, latin, Other, U+0178, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
|
||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||
U+0100 Letter: Upper case letter, Latin, Other, U+0101
|
||||
U+0101 Letter: Lower case letter, Latin, Other, U+0100
|
||||
U+0102 Letter: Upper case letter, Latin, Other, U+0103
|
||||
U+0103 Letter: Lower case letter, Latin, Other, U+0102
|
||||
U+0104 Letter: Upper case letter, Latin, Other, U+0105
|
||||
U+0105 Letter: Lower case letter, Latin, Other, U+0104
|
||||
U+0106 Letter: Upper case letter, Latin, Other, U+0107
|
||||
U+0100 L Letter: Upper case letter, latin, Other, U+0101, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0101 L Letter: Lower case letter, latin, Other, U+0100, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0102 L Letter: Upper case letter, latin, Other, U+0103, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0103 L Letter: Lower case letter, latin, Other, U+0102, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0104 L Letter: Upper case letter, latin, Other, U+0105, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0105 L Letter: Lower case letter, latin, Other, U+0104, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0106 L Letter: Upper case letter, latin, Other, U+0107, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
|
||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||
U+FFE0 Symbol: Currency symbol, Common, Other
|
||||
U+FFE1 Symbol: Currency symbol, Common, Other
|
||||
U+FFE2 Symbol: Mathematical symbol, Common, Other
|
||||
U+FFE3 Symbol: Modifier symbol, Common, Other
|
||||
U+FFE4 Symbol: Other symbol, Common, Other
|
||||
U+FFE5 Symbol: Currency symbol, Common, Other
|
||||
U+FFE6 Symbol: Currency symbol, Common, Other
|
||||
U+FFE7 Control: Unassigned, Unknown, Other
|
||||
U+FFE0 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE1 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE2 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FFE4 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE5 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE6 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE7 L Control: Unassigned, unknown, Other
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
U+FFE8 Symbol: Other symbol, Common, Other
|
||||
U+FFE9 Symbol: Mathematical symbol, Common, Other
|
||||
U+FFEA Symbol: Mathematical symbol, Common, Other
|
||||
U+FFEB Symbol: Mathematical symbol, Common, Other
|
||||
U+FFEC Symbol: Mathematical symbol, Common, Other
|
||||
U+FFED Symbol: Other symbol, Common, Other
|
||||
U+FFEE Symbol: Other symbol, Common, Other
|
||||
U+FFEF Control: Unassigned, Unknown, Other
|
||||
U+FFE8 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE9 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEA ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEB ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEC ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFED ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFEE ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFEF L Control: Unassigned, unknown, Other
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
U+FFF8 Control: Unassigned, Unknown, Control
|
||||
U+FFF9 Control: Format, Common, Control
|
||||
U+FFFA Control: Format, Common, Control
|
||||
U+FFFB Control: Format, Common, Control
|
||||
U+FFFC Symbol: Other symbol, Common, Other
|
||||
U+FFFD Symbol: Other symbol, Common, Other
|
||||
U+FFFE Control: Unassigned, Unknown, Other
|
||||
U+FFFF Control: Unassigned, Unknown, Other
|
||||
U+FFF8 BN Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
|
||||
U+FFF9 ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFA ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFB ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFC ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFFD ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFFE BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFF BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
U+10000 Letter: Other letter, Linear_B, Other
|
||||
U+10001 Letter: Other letter, Linear_B, Other
|
||||
U+E01EF Mark: Non-spacing mark, Inherited, Extend
|
||||
U+F0000 Control: Private use, Unknown, Other
|
||||
U+100000 Control: Private use, Unknown, Other
|
||||
U+10000 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10001 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
|
||||
U+F0000 L Control: Private use, unknown, Other
|
||||
U+100000 L Control: Private use, unknown, Other
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
U+1B00 Mark: Non-spacing mark, Balinese, Extend
|
||||
U+12000 Letter: Other letter, Cuneiform, Other
|
||||
U+07C0 Number: Decimal number, Nko, Other
|
||||
U+A840 Letter: Other letter, Phags_Pa, Other
|
||||
U+10900 Letter: Other letter, Phoenician, Other
|
||||
U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+12000 L Letter: Other letter, cuneiform, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+07C0 R Number: Decimal number, nko, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+A840 L Letter: Other letter, phagspa, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10900 R Letter: Other letter, phoenician, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
findprop 1d79 a77d
|
||||
U+1D79 Letter: Lower case letter, Latin, Other, U+A77D
|
||||
U+A77D Letter: Upper case letter, Latin, Other, U+1D79
|
||||
U+1D79 L Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+A77D L Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
|
||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||
U+0800 Letter: Other letter, Samaritan, Other
|
||||
U+083E Punctuation: Other punctuation, Samaritan, Other
|
||||
U+A4D0 Letter: Other letter, Lisu, Other
|
||||
U+A4F7 Letter: Other letter, Lisu, Other
|
||||
U+AA80 Letter: Other letter, Tai_Viet, Other
|
||||
U+AADF Punctuation: Other punctuation, Tai_Viet, Other
|
||||
U+0800 R Letter: Other letter, samaritan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+083E R Punctuation: Other punctuation, samaritan, Other, [bidimirrored, graphemebase, math, patternsyntax]
|
||||
U+A4D0 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+A4F7 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AA80 L Letter: Other letter, taiviet, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AADF L Punctuation: Other punctuation, taiviet, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||
U+10B00 Letter: Other letter, Avestan, Other
|
||||
U+10B35 Letter: Other letter, Avestan, Other
|
||||
U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||
U+1342E Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||
U+10840 Letter: Other letter, Imperial_Aramaic, Other
|
||||
U+10855 Letter: Other letter, Imperial_Aramaic, Other
|
||||
U+10B00 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10B35 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+13000 L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1342E L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10840 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10855 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 11100 1113c 11680 116c0
|
||||
U+11100 Mark: Non-spacing mark, Chakma, Extend
|
||||
U+1113C Number: Decimal number, Chakma, Other
|
||||
U+11680 Letter: Other letter, Takri, Other
|
||||
U+116C0 Number: Decimal number, Takri, Other
|
||||
U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+1113C L Number: Decimal number, chakma, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+11680 L Letter: Other letter, takri, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+116C0 L Number: Decimal number, takri, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
|
||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
||||
U+000D Control: Control, Common, CR
|
||||
U+000A Control: Control, Common, LF
|
||||
U+000E Control: Control, Common, Control
|
||||
U+0711 Mark: Non-spacing mark, Syriac, Extend
|
||||
U+1B04 Mark: Spacing mark, Balinese, SpacingMark
|
||||
U+1111 Letter: Other letter, Hangul, Hangul syllable type L
|
||||
U+1169 Letter: Other letter, Hangul, Hangul syllable type V
|
||||
U+11FE Letter: Other letter, Hangul, Hangul syllable type T
|
||||
U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV
|
||||
U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+1B04 L Mark: Spacing mark, balinese, SpacingMark, [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1111 L Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1169 L Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+11FE L Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE4C L Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD89 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 118a0 11ac7 16ad0
|
||||
U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0
|
||||
U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other
|
||||
U+16AD0 Letter: Other letter, Bassa_Vah, Other
|
||||
U+118A0 L Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+11AC7 L Letter: Other letter, paucinhau, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+16AD0 L Letter: Other letter, bassavah, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 11700 14400 108e0 11280 1d800
|
||||
U+11700 Letter: Other letter, Ahom, Other
|
||||
U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
|
||||
U+108E0 Letter: Other letter, Hatran, Other
|
||||
U+11280 Letter: Other letter, Multani, Other
|
||||
U+1D800 Symbol: Other symbol, SignWriting, Other
|
||||
U+11700 L Letter: Other letter, ahom, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+14400 L Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+108E0 R Letter: Other letter, hatran, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+11280 L Letter: Other letter, multani, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1D800 L Symbol: Other symbol, signwriting, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
|
||||
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
||||
U+11800 Letter: Other letter, Dogra, Other
|
||||
U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925
|
||||
U+11DA9 Number: Decimal number, Gunjala_Gondi, Other
|
||||
U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
|
||||
U+11EE0 Letter: Other letter, Makasar, Other
|
||||
U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68
|
||||
U+10F27 Letter: Other letter, Old_Sogdian, Other
|
||||
U+10F30 Letter: Other letter, Sogdian, Other
|
||||
U+11800 L Letter: Other letter, dogra, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1E903 R Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+11DA9 L Number: Decimal number, gunjalagondi, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [extendedpictographic, graphemebase, patternsyntax]
|
||||
U+11EE0 L Letter: Other letter, makasar, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+16E48 L Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+10F27 R Letter: Other letter, oldsogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10F30 AL Letter: Other letter, sogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop a836 a833 1cf4 20f0 1cd0
|
||||
U+A836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
|
||||
U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
|
||||
U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
|
||||
U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
|
||||
U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
|
||||
U+A836 L Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+A833 L Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
|
||||
findprop 32ff
|
||||
U+32FF Symbol: Other symbol, Common, Other, [Han]
|
||||
U+32FF L Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
|
||||
findprop 1f16d
|
||||
U+1F16D Symbol: Other symbol, Common, Extended Pictographic
|
||||
U+1F16D ON Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
|
||||
|
||||
findprop U+10e93 U+10eaa
|
||||
U+10E93 Letter: Other letter, Yezidi, Other
|
||||
U+10EAA Control: Unassigned, Unknown, Other
|
||||
U+10E93 R Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10EAA R Control: Unassigned, unknown, Other
|
||||
|
||||
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
|
||||
U+0602 AN Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
|
||||
U+202A LRE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202B RLE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202D LRO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
|
|
|
@ -1,188 +1,298 @@
|
|||
find script Han
|
||||
U+2E80..U+2E99 Symbol: Other symbol, Han, Other
|
||||
U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other
|
||||
U+2F00..U+2FD5 Symbol: Other symbol, Han, Other
|
||||
U+3005 Letter: Modifier letter, Han, Other
|
||||
U+3007 Number: Letter number, Han, Other
|
||||
U+3021..U+3029 Number: Letter number, Han, Other
|
||||
U+3038..U+303A Number: Letter number, Han, Other
|
||||
U+303B Letter: Modifier letter, Han, Other
|
||||
U+3400..U+4DBF Letter: Other letter, Han, Other
|
||||
U+4E00..U+9FFC Letter: Other letter, Han, Other
|
||||
U+F900..U+FA6D Letter: Other letter, Han, Other
|
||||
U+FA70..U+FAD9 Letter: Other letter, Han, Other
|
||||
U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark
|
||||
U+20000..U+2A6DD Letter: Other letter, Han, Other
|
||||
U+2A700..U+2B734 Letter: Other letter, Han, Other
|
||||
U+2B740..U+2B81D Letter: Other letter, Han, Other
|
||||
U+2B820..U+2CEA1 Letter: Other letter, Han, Other
|
||||
U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other
|
||||
U+2F800..U+2FA1D Letter: Other letter, Han, Other
|
||||
U+30000..U+3134A Letter: Other letter, Han, Other
|
||||
U+2E80..U+2E99 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+2E9B..U+2EF3 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+2F00..U+2FD5 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+3005 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+3007 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+3021..U+3029 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+3038..U+303A L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+303B L Letter: Modifier letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
|
||||
U+3400..U+4DBF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+4E00..U+9FFF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+F900..U+FA0D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA0E..U+FA0F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA10 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA11 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA12 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA13..U+FA14 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA15..U+FA1E L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA1F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA20 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA21 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA22 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA23..U+FA24 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA25..U+FA26 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA27..U+FA29 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA2A..U+FA6D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA70..U+FAD9 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+16FE2 ON Punctuation: Other punctuation, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+16FE3 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+16FF0..U+16FF1 L Mark: Spacing mark, han, SpacingMark, [caseignorable, graphemeextend, idcontinue, ideographic, xidcontinue]
|
||||
U+20000..U+2A6DF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2A700..U+2B738 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2B740..U+2B81D L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2B820..U+2CEA1 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2CEB0..U+2EBE0 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2F800..U+2FA1D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+30000..U+3134A L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
find type Pe script Common scriptx Hangul
|
||||
U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||
U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
|
||||
U+3009 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+300B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+300D ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+300F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+3011 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3015 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3017 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3019 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+301B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+301E..U+301F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [softdotted, terminalpunctuation, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FF63 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, emojimodifier, emojimodifierbase]
|
||||
find type Sk
|
||||
U+005E Symbol: Modifier symbol, Common, Other
|
||||
U+0060 Symbol: Modifier symbol, Common, Other
|
||||
U+00A8 Symbol: Modifier symbol, Common, Other
|
||||
U+00AF Symbol: Modifier symbol, Common, Other
|
||||
U+00B4 Symbol: Modifier symbol, Common, Other
|
||||
U+00B8 Symbol: Modifier symbol, Common, Other
|
||||
U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other
|
||||
U+02D2..U+02DF Symbol: Modifier symbol, Common, Other
|
||||
U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other
|
||||
U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other
|
||||
U+02ED Symbol: Modifier symbol, Common, Other
|
||||
U+02EF..U+02FF Symbol: Modifier symbol, Common, Other
|
||||
U+0375 Symbol: Modifier symbol, Greek, Other
|
||||
U+0384 Symbol: Modifier symbol, Greek, Other
|
||||
U+0385 Symbol: Modifier symbol, Common, Other
|
||||
U+1FBD Symbol: Modifier symbol, Greek, Other
|
||||
U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other
|
||||
U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other
|
||||
U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other
|
||||
U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other
|
||||
U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other
|
||||
U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
|
||||
U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin]
|
||||
U+A708..U+A716 Symbol: Modifier symbol, Common, Other
|
||||
U+A720..U+A721 Symbol: Modifier symbol, Common, Other
|
||||
U+A789..U+A78A Symbol: Modifier symbol, Common, Other
|
||||
U+AB5B Symbol: Modifier symbol, Common, Other
|
||||
U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other
|
||||
U+FBB2..U+FBC1 Symbol: Modifier symbol, Arabic, Other
|
||||
U+FF3E Symbol: Modifier symbol, Common, Other
|
||||
U+FF40 Symbol: Modifier symbol, Common, Other
|
||||
U+FFE3 Symbol: Modifier symbol, Common, Other
|
||||
U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend
|
||||
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02C2..U+02C5 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02D2..U+02DF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02E5..U+02E9 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02ED ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02EF..U+02FF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0375 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0384 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0385 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0888 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
|
||||
U+1FBD ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FBF..U+1FC1 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FCD..U+1FCF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FDD..U+1FDF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FED..U+1FEF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FFD..U+1FFE ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+309B..U+309C ON Symbol: Modifier symbol, common, Other, [hiragana, katakana], [alphabetic, bidimirrored, caseignorable, cased, changeswhencasefolded, changeswhenlowercased, changeswhentitlecased, changeswhenuppercased, dash, defaultignorablecodepoint, deprecated, diacritic, emoji, emojicomponent, emojimodifier, emojimodifierbase, emojipresentation, extendedpictographic, extender, graphemebase, graphemeextend, graphemelink, hexdigit, idsbinaryoperator, idstrinaryoperator, idcontinue, idstart, ideographic, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
|
||||
U+A700..U+A707 ON Symbol: Modifier symbol, common, Other, [latin, han], [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A708..U+A716 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A720..U+A721 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A789..U+A78A L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+AB5B L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+AB6A..U+AB6B ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FBB2..U+FBC2 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
|
||||
U+FF3E ON Symbol: Modifier symbol, common, Other, [asciihexdigit, bidicontrol, bidimirrored, cased, changeswhencasefolded, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+FF40 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1F3FB..U+1F3FF ON Symbol: Modifier symbol, common, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternsyntax, radical, sentenceterminal, terminalpunctuation]
|
||||
find type Pd
|
||||
U+002D Punctuation: Dash punctuation, Common, Other
|
||||
U+058A Punctuation: Dash punctuation, Armenian, Other
|
||||
U+05BE Punctuation: Dash punctuation, Hebrew, Other
|
||||
U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other
|
||||
U+1806 Punctuation: Dash punctuation, Mongolian, Other
|
||||
U+2010..U+2015 Punctuation: Dash punctuation, Common, Other
|
||||
U+2E17 Punctuation: Dash punctuation, Common, Other
|
||||
U+2E1A Punctuation: Dash punctuation, Common, Other
|
||||
U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other
|
||||
U+2E40 Punctuation: Dash punctuation, Common, Other
|
||||
U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||
U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
|
||||
U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
|
||||
U+FE31..U+FE32 Punctuation: Dash punctuation, Common, Other
|
||||
U+FE58 Punctuation: Dash punctuation, Common, Other
|
||||
U+FE63 Punctuation: Dash punctuation, Common, Other
|
||||
U+FF0D Punctuation: Dash punctuation, Common, Other
|
||||
U+10EAD Punctuation: Dash punctuation, Yezidi, Other
|
||||
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+058A ON Punctuation: Dash punctuation, armenian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+05BE R Punctuation: Dash punctuation, hebrew, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1400 ON Punctuation: Dash punctuation, canadianaboriginal, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1806 ON Punctuation: Dash punctuation, mongolian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+2010..U+2015 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E17 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E1A ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E3A..U+2E3B ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E40 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E5D ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+301C ON Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+30A0 ON Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE31..U+FE32 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE58 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE63 ES Punctuation: Dash punctuation, common, Other, [caseignorable, sentenceterminal, unifiedideograph, xidcontinue]
|
||||
U+FF0D ES Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+10EAD R Punctuation: Dash punctuation, yezidi, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
find gbreak LVT
|
||||
U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AEA1..U+AEBB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B1B1..U+B1CB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4C1..U+B4DB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT
|
||||
U+AC01..U+AC1B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC1D..U+AC37 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC39..U+AC53 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC55..U+AC6F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC71..U+AC8B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC8D..U+ACA7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACA9..U+ACC3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACC5..U+ACDF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACE1..U+ACFB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACFD..U+AD17 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD19..U+AD33 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD35..U+AD4F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD51..U+AD6B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD6D..U+AD87 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD89..U+ADA3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADA5..U+ADBF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADC1..U+ADDB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADDD..U+ADF7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADF9..U+AE13 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE15..U+AE2F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE31..U+AE4B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE4D..U+AE67 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE69..U+AE83 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE85..U+AE9F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEA1..U+AEBB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEBD..U+AED7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AED9..U+AEF3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEF5..U+AF0F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF11..U+AF2B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF2D..U+AF47 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF49..U+AF63 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF65..U+AF7F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF81..U+AF9B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF9D..U+AFB7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFB9..U+AFD3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFD5..U+AFEF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFF1..U+B00B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B00D..U+B027 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B029..U+B043 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B045..U+B05F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B061..U+B07B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B07D..U+B097 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B099..U+B0B3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0B5..U+B0CF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0D1..U+B0EB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0ED..U+B107 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B109..U+B123 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B125..U+B13F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B141..U+B15B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B15D..U+B177 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B179..U+B193 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B195..U+B1AF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1B1..U+B1CB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1CD..U+B1E7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1E9..U+B203 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B205..U+B21F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B221..U+B23B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B23D..U+B257 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B259..U+B273 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B275..U+B28F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B291..U+B2AB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2AD..U+B2C7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2C9..U+B2E3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2E5..U+B2FF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B301..U+B31B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B31D..U+B337 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B339..U+B353 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B355..U+B36F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B371..U+B38B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B38D..U+B3A7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3A9..U+B3C3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3C5..U+B3DF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3E1..U+B3FB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3FD..U+B417 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B419..U+B433 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B435..U+B44F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B451..U+B46B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B46D..U+B487 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B489..U+B4A3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4A5..U+B4BF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4C1..U+B4DB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4DD..U+B4F7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4F9..U+B513 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B515..U+B52F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B531..U+B54B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B54D..U+B567 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B569..U+B583 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B585..U+B59F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5A1..U+B5BB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5BD..U+B5D7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5D9..U+B5F3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5F5..U+B60F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B611..U+B62B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B62D..U+B647 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B649..U+B663 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B665..U+B67F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B681..U+B69B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B69D..U+B6B7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B6B9..U+B6D3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B6D5..U+B6EF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
...
|
||||
find script Old_Uyghur
|
||||
U+10F70..U+10F81 R Letter: Other letter, olduyghur, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+10F86..U+10F89 R Punctuation: Other punctuation, olduyghur, Other, [bidimirrored, graphemebase, math, patternsyntax]
|
||||
find bidi PDF
|
||||
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi CS
|
||||
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
|
||||
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+060C CS Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+202F CS Separator: Space separator, common, Other, [latin, mongolian], [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+FE50 CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+FE52 CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF0C CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+FF0E CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FF0F CS Punctuation: Other punctuation, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
find bidi CS type Sm
|
||||
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
find bidi B
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+001C..U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+2029 B Separator: Paragraph separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
find bidi FSI
|
||||
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi PDI
|
||||
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi RLI
|
||||
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi RLO
|
||||
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi S
|
||||
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
find bidi WS
|
||||
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
|
||||
U+1680 WS Separator: Space separator, ogham, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2000..U+200A WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2028 WS Separator: Line separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+205F WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+3000 WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
find script bopo
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+3105..U+312F L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+31A0..U+31BF L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
find bool prependedconcatenationmark
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
|
||||
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
find bool pcm
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
|
||||
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
|
|
|
@ -97,6 +97,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
/* Have PTHREAD_PRIO_INHERIT. */
|
||||
/* #undef HAVE_PTHREAD_PRIO_INHERIT */
|
||||
|
||||
/* Define to 1 if you have the <readline.h> header file. */
|
||||
/* #undef HAVE_READLINE_H */
|
||||
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
/* #undef HAVE_READLINE_HISTORY_H */
|
||||
|
||||
|
@ -233,7 +236,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_NAME "PCRE2"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE2 10.38-RC1"
|
||||
#define PACKAGE_STRING "PCRE2 10.40"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre2"
|
||||
|
@ -242,7 +245,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "10.38-RC1"
|
||||
#define PACKAGE_VERSION "10.40"
|
||||
|
||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||
|
@ -435,7 +438,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#endif
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "10.38-RC1"
|
||||
#define VERSION "10.40"
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
|
|
@ -97,6 +97,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
/* Have PTHREAD_PRIO_INHERIT. */
|
||||
#undef HAVE_PTHREAD_PRIO_INHERIT
|
||||
|
||||
/* Define to 1 if you have the <readline.h> header file. */
|
||||
#undef HAVE_READLINE_H
|
||||
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
#undef HAVE_READLINE_HISTORY_H
|
||||
|
||||
|
|
|
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE2_MAJOR 10
|
||||
#define PCRE2_MINOR 38
|
||||
#define PCRE2_PRERELEASE -RC1
|
||||
#define PCRE2_DATE 2021-08-31
|
||||
#define PCRE2_MINOR 40
|
||||
#define PCRE2_PRERELEASE
|
||||
#define PCRE2_DATE 2022-04-14
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
|
@ -84,8 +84,8 @@ set, we ensure here that it has no effect. */
|
|||
/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and
|
||||
uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do
|
||||
not have stdint.h, which is why we use inttypes.h, which according to the C
|
||||
standard is a superset of stdint.h. If none of these headers are available,
|
||||
the relevant values must be provided by some other means. */
|
||||
standard is a superset of stdint.h. If inttypes.h is not available the build
|
||||
will break and the relevant values must be provided by some other means. */
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
|
|
|
@ -84,8 +84,8 @@ set, we ensure here that it has no effect. */
|
|||
/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and
|
||||
uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do
|
||||
not have stdint.h, which is why we use inttypes.h, which according to the C
|
||||
standard is a superset of stdint.h. If none of these headers are available,
|
||||
the relevant values must be provided by some other means. */
|
||||
standard is a superset of stdint.h. If inttypes.h is not available the build
|
||||
will break and the relevant values must be provided by some other means. */
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -123,18 +123,21 @@ opcode is used to select the column. The values are as follows:
|
|||
*/
|
||||
|
||||
static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
|
||||
/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
|
||||
{ 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
|
||||
{ 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
|
||||
{ 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
|
||||
{ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
|
||||
{ 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
|
||||
{ 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
|
||||
{ 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
|
||||
{ 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
|
||||
/* ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
|
||||
{ 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */
|
||||
{ 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */
|
||||
{ 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */
|
||||
{ 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
|
||||
{ 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */
|
||||
{ 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */
|
||||
{ 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */
|
||||
{ 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */
|
||||
{ 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */
|
||||
};
|
||||
|
||||
/* This table is used to check whether auto-possessification is possible
|
||||
|
@ -196,6 +199,7 @@ static BOOL
|
|||
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
|
||||
BOOL negated)
|
||||
{
|
||||
BOOL ok;
|
||||
const uint32_t *p;
|
||||
const ucd_record *prop = GET_UCD(c);
|
||||
|
||||
|
@ -215,6 +219,11 @@ switch(ptype)
|
|||
case PT_SC:
|
||||
return (pdata == prop->script) == negated;
|
||||
|
||||
case PT_SCX:
|
||||
ok = (pdata == prop->script
|
||||
|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
|
||||
return ok == negated;
|
||||
|
||||
/* These are specials */
|
||||
|
||||
case PT_ALNUM:
|
||||
|
@ -251,6 +260,14 @@ switch(ptype)
|
|||
if (c == *p++) return negated;
|
||||
}
|
||||
break; /* Control never reaches here */
|
||||
|
||||
/* Haven't yet thought these through. */
|
||||
|
||||
case PT_BIDICL:
|
||||
return FALSE;
|
||||
|
||||
case PT_BOOL:
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -124,7 +124,7 @@ static unsigned int
|
|||
|
||||
static int
|
||||
compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
|
||||
uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
|
||||
uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
|
||||
compile_block *, PCRE2_SIZE *);
|
||||
|
||||
static int
|
||||
|
@ -137,7 +137,7 @@ static BOOL
|
|||
|
||||
static int
|
||||
check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
|
||||
compile_block *);
|
||||
compile_block *, int *);
|
||||
|
||||
|
||||
/*************************************************
|
||||
|
@ -385,13 +385,15 @@ compiler is clever with identical subexpressions. */
|
|||
|
||||
#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
|
||||
|
||||
/* Private flags added to firstcu and reqcu. */
|
||||
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
|
||||
variables, which are concerned with first and required code units. A value
|
||||
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
|
||||
matching xxcu variable is set, and the low valued bits are relevant. */
|
||||
|
||||
#define REQ_CASELESS (1u << 0) /* Indicates caselessness */
|
||||
#define REQ_VARY (1u << 1) /* reqcu followed non-literal item */
|
||||
/* Negative values for the firstcu and reqcu flags */
|
||||
#define REQ_UNSET (-2) /* Not yet found anything */
|
||||
#define REQ_NONE (-1) /* Found not fixed char */
|
||||
#define REQ_UNSET 0xffffffffu /* Not yet found anything */
|
||||
#define REQ_NONE 0xfffffffeu /* Found not fixed character */
|
||||
#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */
|
||||
#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */
|
||||
|
||||
/* These flags are used in the groupinfo vector. */
|
||||
|
||||
|
@ -1264,8 +1266,10 @@ PCRE2_SIZE* ref_count;
|
|||
|
||||
if (code != NULL)
|
||||
{
|
||||
#ifdef SUPPORT_JIT
|
||||
if (code->executable_jit != NULL)
|
||||
PRIV(jit_free)(code->executable_jit, &code->memctl);
|
||||
#endif
|
||||
|
||||
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
|
||||
{
|
||||
|
@ -2088,7 +2092,9 @@ get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
|
|||
PCRE2_UCHAR c;
|
||||
PCRE2_SIZE i, bot, top;
|
||||
PCRE2_SPTR ptr = *ptrptr;
|
||||
PCRE2_UCHAR name[32];
|
||||
PCRE2_UCHAR name[50];
|
||||
PCRE2_UCHAR *vptr = NULL;
|
||||
uint16_t ptscript = PT_NOTSCRIPT;
|
||||
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
c = *ptr++;
|
||||
|
@ -2100,36 +2106,95 @@ negation. */
|
|||
if (c == CHAR_LEFT_CURLY_BRACKET)
|
||||
{
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
|
||||
if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
|
||||
{
|
||||
*negptr = TRUE;
|
||||
ptr++;
|
||||
}
|
||||
|
||||
for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
|
||||
{
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
c = *ptr++;
|
||||
while (c == '_' || c == '-' || isspace(c))
|
||||
{
|
||||
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
|
||||
c = *ptr++;
|
||||
}
|
||||
if (c == CHAR_NUL) goto ERROR_RETURN;
|
||||
if (c == CHAR_RIGHT_CURLY_BRACKET) break;
|
||||
name[i] = c;
|
||||
name[i] = tolower(c);
|
||||
if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
|
||||
}
|
||||
|
||||
if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
|
||||
name[i] = 0;
|
||||
}
|
||||
|
||||
/* Otherwise there is just one following character, which must be an ASCII
|
||||
letter. */
|
||||
/* If { doesn't follow \p or \P there is just one following character, which
|
||||
must be an ASCII letter. */
|
||||
|
||||
else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
|
||||
{
|
||||
name[0] = c;
|
||||
name[0] = tolower(c);
|
||||
name[1] = 0;
|
||||
}
|
||||
else goto ERROR_RETURN;
|
||||
|
||||
*ptrptr = ptr;
|
||||
|
||||
/* Search for a recognized property name using binary chop. */
|
||||
/* If the property contains ':' or '=' we have class name and value separately
|
||||
specified. The following are supported:
|
||||
|
||||
. Bidi_Class (synonym bc), for which the property names are "bidi<name>".
|
||||
. Script (synonym sc) for which the property name is the script name
|
||||
. Script_Extensions (synonym scx), ditto
|
||||
|
||||
As this is a small number, we currently just check the names directly. If this
|
||||
grows, a sorted table and a switch will be neater.
|
||||
|
||||
For both the script properties, set a PT_xxx value so that (1) they can be
|
||||
distinguished and (2) invalid script names that happen to be the name of
|
||||
another property can be diagnosed. */
|
||||
|
||||
if (vptr != NULL)
|
||||
{
|
||||
int offset = 0;
|
||||
PCRE2_UCHAR sname[8];
|
||||
|
||||
*vptr = 0; /* Terminate property name */
|
||||
if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
|
||||
PRIV(strcmp_c8)(name, STRING_bc) == 0)
|
||||
{
|
||||
offset = 4;
|
||||
sname[0] = CHAR_b;
|
||||
sname[1] = CHAR_i; /* There is no strcpy_c8 function */
|
||||
sname[2] = CHAR_d;
|
||||
sname[3] = CHAR_i;
|
||||
}
|
||||
|
||||
else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
|
||||
PRIV(strcmp_c8)(name, STRING_sc) == 0)
|
||||
ptscript = PT_SC;
|
||||
|
||||
else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
|
||||
PRIV(strcmp_c8)(name, STRING_scx) == 0)
|
||||
ptscript = PT_SCX;
|
||||
|
||||
else
|
||||
{
|
||||
*errorcodeptr = ERR47;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* Adjust the string in name[] as needed */
|
||||
|
||||
memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
|
||||
if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
|
||||
}
|
||||
|
||||
/* Search for a recognized property using binary chop. */
|
||||
|
||||
bot = 0;
|
||||
top = PRIV(utt_size);
|
||||
|
@ -2139,15 +2204,37 @@ while (bot < top)
|
|||
int r;
|
||||
i = (bot + top) >> 1;
|
||||
r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
|
||||
/* When a matching property is found, some extra checking is needed when the
|
||||
\p{xx:yy} syntax is used and xx is either sc or scx. */
|
||||
|
||||
if (r == 0)
|
||||
{
|
||||
*ptypeptr = PRIV(utt)[i].type;
|
||||
*pdataptr = PRIV(utt)[i].value;
|
||||
return TRUE;
|
||||
if (vptr == NULL || ptscript == PT_NOTSCRIPT)
|
||||
{
|
||||
*ptypeptr = PRIV(utt)[i].type;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
switch (PRIV(utt)[i].type)
|
||||
{
|
||||
case PT_SC:
|
||||
*ptypeptr = PT_SC;
|
||||
return TRUE;
|
||||
|
||||
case PT_SCX:
|
||||
*ptypeptr = ptscript;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
break; /* Non-script found */
|
||||
}
|
||||
|
||||
if (r > 0) bot = i + 1; else top = i;
|
||||
}
|
||||
*errorcodeptr = ERR47; /* Unrecognized name */
|
||||
|
||||
*errorcodeptr = ERR47; /* Unrecognized property */
|
||||
return FALSE;
|
||||
|
||||
ERROR_RETURN: /* Malformed \P or \p */
|
||||
|
@ -5285,9 +5372,9 @@ Arguments:
|
|||
pptrptr points to the current parsed pattern pointer
|
||||
errorcodeptr points to error code variable
|
||||
firstcuptr place to put the first required code unit
|
||||
firstcuflagsptr place to put the first code unit flags, or a negative number
|
||||
firstcuflagsptr place to put the first code unit flags
|
||||
reqcuptr place to put the last required code unit
|
||||
reqcuflagsptr place to put the last required code unit flags, or a negative number
|
||||
reqcuflagsptr place to put the last required code unit flags
|
||||
bcptr points to current branch chain
|
||||
cb contains pointers to tables etc.
|
||||
lengthptr NULL during the real compile phase
|
||||
|
@ -5300,8 +5387,8 @@ Returns: 0 There's been an error, *errorcodeptr is non-zero
|
|||
|
||||
static int
|
||||
compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
||||
int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
|
||||
uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
|
||||
int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr,
|
||||
uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr,
|
||||
compile_block *cb, PCRE2_SIZE *lengthptr)
|
||||
{
|
||||
int bravalue = 0;
|
||||
|
@ -5316,9 +5403,9 @@ uint32_t zeroreqcu, zerofirstcu;
|
|||
uint32_t escape;
|
||||
uint32_t *pptr = *pptrptr;
|
||||
uint32_t meta, meta_arg;
|
||||
int32_t firstcuflags, reqcuflags;
|
||||
int32_t zeroreqcuflags, zerofirstcuflags;
|
||||
int32_t req_caseopt, reqvary, tempreqvary;
|
||||
uint32_t firstcuflags, reqcuflags;
|
||||
uint32_t zeroreqcuflags, zerofirstcuflags;
|
||||
uint32_t req_caseopt, reqvary, tempreqvary;
|
||||
PCRE2_SIZE offset = 0;
|
||||
PCRE2_SIZE length_prevgroup = 0;
|
||||
PCRE2_UCHAR *code = *codeptr;
|
||||
|
@ -5374,13 +5461,13 @@ item types that can be repeated set these backoff variables appropriately. */
|
|||
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
|
||||
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
|
||||
|
||||
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
|
||||
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
|
||||
according to the current setting of the caseless flag. The REQ_CASELESS value
|
||||
leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
|
||||
to record the case status of the value. This is used only for ASCII characters.
|
||||
*/
|
||||
|
||||
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
|
||||
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
|
||||
|
||||
/* Switch on next META item until the end of the branch */
|
||||
|
||||
|
@ -5395,13 +5482,12 @@ for (;; pptr++)
|
|||
BOOL possessive_quantifier;
|
||||
BOOL note_group_empty;
|
||||
int class_has_8bitchar;
|
||||
int i;
|
||||
uint32_t mclength;
|
||||
uint32_t skipunits;
|
||||
uint32_t subreqcu, subfirstcu;
|
||||
uint32_t groupnumber;
|
||||
uint32_t verbarglen, verbculen;
|
||||
int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
|
||||
uint32_t subreqcuflags, subfirstcuflags;
|
||||
open_capitem *oc;
|
||||
PCRE2_UCHAR mcbuffer[8];
|
||||
|
||||
|
@ -5770,9 +5856,9 @@ for (;; pptr++)
|
|||
if (taboffset >= 0)
|
||||
{
|
||||
if (tabopt >= 0)
|
||||
for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
|
||||
for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
|
||||
else
|
||||
for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
|
||||
for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
|
||||
}
|
||||
|
||||
/* Now see if we need to remove any special characters. An option
|
||||
|
@ -5786,9 +5872,9 @@ for (;; pptr++)
|
|||
being built and we are done. */
|
||||
|
||||
if (local_negate)
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
|
||||
else
|
||||
for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
|
||||
|
||||
/* Every class contains at least one < 256 character. */
|
||||
|
||||
|
@ -5827,21 +5913,23 @@ for (;; pptr++)
|
|||
switch(escape)
|
||||
{
|
||||
case ESC_d:
|
||||
for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
|
||||
break;
|
||||
|
||||
case ESC_D:
|
||||
should_flip_negation = TRUE;
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
|
||||
for (int i = 0; i < 32; i++)
|
||||
classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
|
||||
break;
|
||||
|
||||
case ESC_w:
|
||||
for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
|
||||
break;
|
||||
|
||||
case ESC_W:
|
||||
should_flip_negation = TRUE;
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
|
||||
for (int i = 0; i < 32; i++)
|
||||
classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
|
||||
break;
|
||||
|
||||
/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
|
||||
|
@ -5852,12 +5940,13 @@ for (;; pptr++)
|
|||
longer treat \s and \S specially. */
|
||||
|
||||
case ESC_s:
|
||||
for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
|
||||
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
|
||||
break;
|
||||
|
||||
case ESC_S:
|
||||
should_flip_negation = TRUE;
|
||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
|
||||
for (int i = 0; i < 32; i++)
|
||||
classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
|
||||
break;
|
||||
|
||||
/* When adding the horizontal or vertical space lists to a class, or
|
||||
|
@ -6098,7 +6187,7 @@ for (;; pptr++)
|
|||
if (negate_class && !xclass_has_prop)
|
||||
{
|
||||
/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
|
||||
for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
}
|
||||
memcpy(code, classbits, 32);
|
||||
code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
|
||||
|
@ -6124,7 +6213,7 @@ for (;; pptr++)
|
|||
if (negate_class)
|
||||
{
|
||||
/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
|
||||
for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
|
||||
}
|
||||
memcpy(code, classbits, 32);
|
||||
}
|
||||
|
@ -6198,7 +6287,7 @@ for (;; pptr++)
|
|||
verbarglen = *(++pptr);
|
||||
verbculen = 0;
|
||||
tempcode = code++;
|
||||
for (i = 0; i < (int)verbarglen; i++)
|
||||
for (int i = 0; i < (int)verbarglen; i++)
|
||||
{
|
||||
meta = *(++pptr);
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
@ -6247,6 +6336,7 @@ for (;; pptr++)
|
|||
bravalue = OP_COND;
|
||||
{
|
||||
int count, index;
|
||||
unsigned int i;
|
||||
PCRE2_SPTR name;
|
||||
named_group *ng = cb->named_groups;
|
||||
uint32_t length = *(++pptr);
|
||||
|
@ -6286,7 +6376,7 @@ for (;; pptr++)
|
|||
groupnumber = 0;
|
||||
if (meta == META_COND_RNUMBER)
|
||||
{
|
||||
for (i = 1; i < (int)length; i++)
|
||||
for (i = 1; i < length; i++)
|
||||
{
|
||||
groupnumber = groupnumber * 10 + name[i] - CHAR_0;
|
||||
if (groupnumber > MAX_GROUP_NUMBER)
|
||||
|
@ -6608,7 +6698,7 @@ for (;; pptr++)
|
|||
|
||||
if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
|
||||
{
|
||||
if (subfirstcuflags >= 0)
|
||||
if (subfirstcuflags < REQ_NONE)
|
||||
{
|
||||
firstcu = subfirstcu;
|
||||
firstcuflags = subfirstcuflags;
|
||||
|
@ -6622,7 +6712,7 @@ for (;; pptr++)
|
|||
into reqcu if there wasn't one, using the vary flag that was in
|
||||
existence beforehand. */
|
||||
|
||||
else if (subfirstcuflags >= 0 && subreqcuflags < 0)
|
||||
else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
|
||||
{
|
||||
subreqcu = subfirstcu;
|
||||
subreqcuflags = subfirstcuflags | tempreqvary;
|
||||
|
@ -6631,7 +6721,7 @@ for (;; pptr++)
|
|||
/* If the subpattern set a required code unit (or set a first code unit
|
||||
that isn't really the first code unit - see above), set it. */
|
||||
|
||||
if (subreqcuflags >= 0)
|
||||
if (subreqcuflags < REQ_NONE)
|
||||
{
|
||||
reqcu = subreqcu;
|
||||
reqcuflags = subreqcuflags;
|
||||
|
@ -6650,7 +6740,7 @@ for (;; pptr++)
|
|||
in that example, 'X' ends up set for both. */
|
||||
|
||||
else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
|
||||
subreqcuflags >= 0 && subfirstcuflags >= 0)
|
||||
subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
|
||||
{
|
||||
reqcu = subreqcu;
|
||||
reqcuflags = subreqcuflags;
|
||||
|
@ -6680,7 +6770,7 @@ for (;; pptr++)
|
|||
this name is duplicated. */
|
||||
|
||||
groupnumber = 0;
|
||||
for (i = 0; i < cb->names_found; i++, ng++)
|
||||
for (unsigned int i = 0; i < cb->names_found; i++, ng++)
|
||||
{
|
||||
if (length == ng->length &&
|
||||
PRIV(strncmp)(name, ng->name, length) == 0)
|
||||
|
@ -6935,14 +7025,19 @@ for (;; pptr++)
|
|||
#endif /* MAYBE_UTF_MULTI */
|
||||
|
||||
/* Handle the case of a single code unit - either with no UTF support, or
|
||||
with UTF disabled, or for a single-code-unit UTF character. */
|
||||
with UTF disabled, or for a single-code-unit UTF character. In the latter
|
||||
case, for a repeated positive match, get the caseless flag for the
|
||||
required code unit from the previous character, because a class like [Aa]
|
||||
sets a caseless A but by now the req_caseopt flag has been reset. */
|
||||
|
||||
{
|
||||
mcbuffer[0] = code[-1];
|
||||
mclength = 1;
|
||||
if (op_previous <= OP_CHARI && repeat_min > 1)
|
||||
{
|
||||
reqcu = mcbuffer[0];
|
||||
reqcuflags = req_caseopt | cb->req_varyopt;
|
||||
reqcuflags = cb->req_varyopt;
|
||||
if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
|
||||
}
|
||||
}
|
||||
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
|
||||
|
@ -7034,7 +7129,7 @@ for (;; pptr++)
|
|||
*lengthptr += delta;
|
||||
}
|
||||
|
||||
else for (i = 0; i < replicate; i++)
|
||||
else for (int i = 0; i < replicate; i++)
|
||||
{
|
||||
memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
|
||||
previous = code;
|
||||
|
@ -7210,12 +7305,12 @@ for (;; pptr++)
|
|||
|
||||
else
|
||||
{
|
||||
if (groupsetfirstcu && reqcuflags < 0)
|
||||
if (groupsetfirstcu && reqcuflags >= REQ_NONE)
|
||||
{
|
||||
reqcu = firstcu;
|
||||
reqcuflags = firstcuflags;
|
||||
}
|
||||
for (i = 1; (uint32_t)i < repeat_min; i++)
|
||||
for (uint32_t i = 1; i < repeat_min; i++)
|
||||
{
|
||||
memcpy(code, previous, CU2BYTES(len));
|
||||
code += len;
|
||||
|
@ -7259,14 +7354,14 @@ for (;; pptr++)
|
|||
|
||||
/* This is compiling for real */
|
||||
|
||||
else for (i = repeat_max - 1; i >= 0; i--)
|
||||
else for (uint32_t i = repeat_max; i >= 1; i--)
|
||||
{
|
||||
*code++ = OP_BRAZERO + repeat_type;
|
||||
|
||||
/* All but the final copy start a new nesting, maintaining the
|
||||
chain of brackets outstanding. */
|
||||
|
||||
if (i != 0)
|
||||
if (i != 1)
|
||||
{
|
||||
int linkoffset;
|
||||
*code++ = OP_BRA;
|
||||
|
@ -7985,9 +8080,9 @@ Arguments:
|
|||
errorcodeptr -> pointer to error code variable
|
||||
skipunits skip this many code units at start (for brackets and OP_COND)
|
||||
firstcuptr place to put the first required code unit
|
||||
firstcuflagsptr place to put the first code unit flags, or a negative number
|
||||
firstcuflagsptr place to put the first code unit flags
|
||||
reqcuptr place to put the last required code unit
|
||||
reqcuflagsptr place to put the last required code unit flags, or a negative number
|
||||
reqcuflagsptr place to put the last required code unit flags
|
||||
bcptr pointer to the chain of currently open branches
|
||||
cb points to the data block with tables pointers etc.
|
||||
lengthptr NULL during the real compile phase
|
||||
|
@ -8001,7 +8096,7 @@ Returns: 0 There has been an error
|
|||
static int
|
||||
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
||||
int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
|
||||
int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
|
||||
uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr,
|
||||
branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
|
||||
{
|
||||
PCRE2_UCHAR *code = *codeptr;
|
||||
|
@ -8014,9 +8109,9 @@ int okreturn = 1;
|
|||
uint32_t *pptr = *pptrptr;
|
||||
uint32_t firstcu, reqcu;
|
||||
uint32_t lookbehindlength;
|
||||
int32_t firstcuflags, reqcuflags;
|
||||
uint32_t firstcuflags, reqcuflags;
|
||||
uint32_t branchfirstcu, branchreqcu;
|
||||
int32_t branchfirstcuflags, branchreqcuflags;
|
||||
uint32_t branchfirstcuflags, branchreqcuflags;
|
||||
PCRE2_SIZE length;
|
||||
branch_chain bc;
|
||||
|
||||
|
@ -8135,9 +8230,9 @@ for (;;)
|
|||
|
||||
if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
|
||||
{
|
||||
if (firstcuflags >= 0)
|
||||
if (firstcuflags < REQ_NONE)
|
||||
{
|
||||
if (reqcuflags < 0)
|
||||
if (reqcuflags >= REQ_NONE)
|
||||
{
|
||||
reqcu = firstcu;
|
||||
reqcuflags = firstcuflags;
|
||||
|
@ -8149,8 +8244,8 @@ for (;;)
|
|||
/* If we (now or from before) have no firstcu, a firstcu from the
|
||||
branch becomes a reqcu if there isn't a branch reqcu. */
|
||||
|
||||
if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
|
||||
branchreqcuflags < 0)
|
||||
if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
|
||||
branchreqcuflags >= REQ_NONE)
|
||||
{
|
||||
branchreqcu = branchfirstcu;
|
||||
branchreqcuflags = branchfirstcuflags;
|
||||
|
@ -8298,7 +8393,7 @@ Returns: TRUE or FALSE
|
|||
*/
|
||||
|
||||
static BOOL
|
||||
is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
|
||||
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
|
||||
int atomcount, BOOL inassert)
|
||||
{
|
||||
do {
|
||||
|
@ -8321,7 +8416,7 @@ do {
|
|||
op == OP_SCBRA || op == OP_SCBRAPOS)
|
||||
{
|
||||
int n = GET2(scode, 1+LINK_SIZE);
|
||||
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||||
uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||||
if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
|
||||
}
|
||||
|
||||
|
@ -8681,15 +8776,15 @@ Returns: the fixed first code unit, or 0 with REQ_NONE in flags
|
|||
*/
|
||||
|
||||
static uint32_t
|
||||
find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
|
||||
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
|
||||
{
|
||||
uint32_t c = 0;
|
||||
int cflags = REQ_NONE;
|
||||
uint32_t cflags = REQ_NONE;
|
||||
|
||||
*flags = REQ_NONE;
|
||||
do {
|
||||
uint32_t d;
|
||||
int dflags;
|
||||
uint32_t dflags;
|
||||
int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
|
||||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
|
||||
PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
|
||||
|
@ -8712,9 +8807,8 @@ do {
|
|||
case OP_SCRIPT_RUN:
|
||||
d = find_firstassertedcu(scode, &dflags, inassert +
|
||||
((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
|
||||
if (dflags < 0)
|
||||
return 0;
|
||||
if (cflags < 0) { c = d; cflags = dflags; }
|
||||
if (dflags >= REQ_NONE) return 0;
|
||||
if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
|
||||
else if (c != d || cflags != dflags) return 0;
|
||||
break;
|
||||
|
||||
|
@ -8727,7 +8821,7 @@ do {
|
|||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
if (inassert == 0) return 0;
|
||||
if (cflags < 0) { c = scode[1]; cflags = 0; }
|
||||
if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
|
||||
else if (c != scode[1]) return 0;
|
||||
break;
|
||||
|
||||
|
@ -8753,7 +8847,7 @@ do {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
|
||||
if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
|
||||
else if (c != scode[1]) return 0;
|
||||
break;
|
||||
}
|
||||
|
@ -9161,7 +9255,7 @@ for (;; pptr++)
|
|||
case META_LOOKAHEAD:
|
||||
case META_LOOKAHEADNOT:
|
||||
case META_LOOKAHEAD_NA:
|
||||
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb);
|
||||
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
|
||||
if (*errcodeptr != 0) return -1;
|
||||
|
||||
/* Ignore any qualifiers that follow a lookahead assertion. */
|
||||
|
@ -9501,16 +9595,16 @@ Arguments
|
|||
retptr if not NULL, return the ket pointer here
|
||||
recurses chain of recurse_check to catch mutual recursion
|
||||
cb points to the compile block
|
||||
lcptr points to loop counter
|
||||
|
||||
Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
|
||||
*/
|
||||
|
||||
static int
|
||||
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
|
||||
parsed_recurse_check *recurses, compile_block *cb)
|
||||
parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
|
||||
{
|
||||
int errorcode = 0;
|
||||
int loopcount = 0;
|
||||
int nestlevel = 0;
|
||||
|
||||
cb->erroroffset = PCRE2_UNSET;
|
||||
|
@ -9636,7 +9730,7 @@ for (; *pptr != META_END; pptr++)
|
|||
case META_LOOKBEHIND:
|
||||
case META_LOOKBEHINDNOT:
|
||||
case META_LOOKBEHIND_NA:
|
||||
if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb))
|
||||
if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
|
||||
return errorcode;
|
||||
break;
|
||||
}
|
||||
|
@ -9689,7 +9783,7 @@ PCRE2_SIZE re_blocksize; /* Size of memory block */
|
|||
PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
|
||||
PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
|
||||
|
||||
int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
|
||||
uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
|
||||
uint32_t firstcu, reqcu; /* Value of first/req code unit */
|
||||
uint32_t setflags = 0; /* NL and BSR set flags */
|
||||
|
||||
|
@ -10091,7 +10185,8 @@ lengths. */
|
|||
|
||||
if (has_lookbehind)
|
||||
{
|
||||
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb);
|
||||
int loopcount = 0;
|
||||
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
|
||||
if (errorcode != 0) goto HAD_CB_ERROR;
|
||||
}
|
||||
|
||||
|
@ -10368,13 +10463,13 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
(these are not saved during the compile because they can cause conflicts with
|
||||
actual literals that follow). */
|
||||
|
||||
if (firstcuflags < 0)
|
||||
if (firstcuflags >= REQ_NONE)
|
||||
firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
|
||||
|
||||
/* Save the data for a first code unit. The existence of one means the
|
||||
minimum length must be at least 1. */
|
||||
|
||||
if (firstcuflags >= 0)
|
||||
if (firstcuflags < REQ_NONE)
|
||||
{
|
||||
re->first_codeunit = firstcu;
|
||||
re->flags |= PCRE2_FIRSTSET;
|
||||
|
@ -10421,16 +10516,16 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
different character and not a non-starting code unit of the first character,
|
||||
because the minimum length count is in characters, not code units. */
|
||||
|
||||
if (reqcuflags >= 0)
|
||||
if (reqcuflags < REQ_NONE)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
|
||||
firstcuflags < 0 || /* First not set */
|
||||
firstcuflags >= REQ_NONE || /* First not set */
|
||||
(firstcu & 0xf800) != 0xd800 || /* First not surrogate */
|
||||
(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
|
||||
firstcuflags < 0 || /* First not set */
|
||||
firstcuflags >= REQ_NONE || /* First not set */
|
||||
(firstcu & 0x80) == 0 || /* First is ASCII */
|
||||
(reqcu & 0x80) == 0) /* Req is ASCII */
|
||||
#endif
|
||||
|
@ -10527,4 +10622,10 @@ re = NULL;
|
|||
goto EXIT;
|
||||
}
|
||||
|
||||
/* These #undefs are here to enable unity builds with CMake. */
|
||||
|
||||
#undef NLBLOCK /* Block containing newline information */
|
||||
#undef PSSTART /* Field containing processed string start */
|
||||
#undef PSEND /* Field containing processed string end */
|
||||
|
||||
/* End of pcre2_compile.c */
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue