Compare commits
296 Commits
pcre2-10.3
...
amigaos
Author | SHA1 | Date |
---|---|---|
George Sokianos | 4a45482c9c | |
Philip Hazel | 8b133fa0ba | |
Philip Hazel | cc5e121c8e | |
Philip Hazel | 1343bdff8f | |
Philip Hazel | d90fb23878 | |
Ezekiel Warren | e47fc51584 | |
Zoltan Herczeg | b67d568201 | |
Zoltan Herczeg | 4851890ede | |
Amin Yahyaabadi | 3e52db5209 | |
Philip Hazel | 4804b00e8f | |
Philip Hazel | 7549fdca74 | |
Philip Hazel | 5271b533c4 | |
larinsv | 45af1203bd | |
Rémi Verschelde | 187b7ba050 | |
William A Rowe Jr | 06f34ba374 | |
GregThain | a334ea2a34 | |
Carlo Marcelo Arenas Belón | 15a82c3efd | |
Philip Hazel | 51a5fcdc1f | |
Philip Hazel | 104fe2fead | |
Philip Hazel | f65df06305 | |
pkeir | a13d7d4340 | |
Lucas Trzesniewski | c630e868ca | |
Joe Zhang | 77ce1ff528 | |
Philip Hazel | ff5402a378 | |
Philip Hazel | b52d055d1b | |
Carlo Marcelo Arenas Belón | a4ac97fea8 | |
Philip Hazel | fedf4d9d40 | |
Philip Hazel | 8ebf9efe7b | |
Carlo Marcelo Arenas Belón | 4edcf6ada5 | |
Philip Hazel | d0c7544e78 | |
Carlo Marcelo Arenas Belón | f28e82602d | |
Philip Hazel | 1bb2b97b29 | |
Lucas Trzesniewski | 3fec24a26f | |
Philip Hazel | 66b3cb34df | |
Philip Hazel | 29a43aa11d | |
Philip Hazel | 3103b8f20a | |
Philip Hazel | 13be26a5c2 | |
pagabuc | ba6a5f16d2 | |
Zoltan Herczeg | d07c967b3a | |
Carlo Marcelo Arenas Belón | 4279abbd7d | |
Philip Hazel | 8ff3ab27d5 | |
Zoltan Herczeg | e612e06b5d | |
Philip Hazel | 64c9baaaa4 | |
Carlo Marcelo Arenas Belón | 9c8abddc52 | |
Carlo Marcelo Arenas Belón | f11c26842d | |
Zoltan Herczeg | 4ca0530b9b | |
Zoltan Herczeg | 03654e751e | |
Zoltan Herczeg | d4fa336fbc | |
Zoltan Herczeg | 50a51cb7e6 | |
Philip Hazel | f7a7341726 | |
Philip Hazel | eef5740ff9 | |
Zoltan Herczeg | dea56d2df9 | |
Adam | 111cd470b5 | |
Philip Hazel | fdd9479108 | |
Philip Hazel | 419e3c68a3 | |
Zoltan Herczeg | e21345de97 | |
Philip Hazel | e85a81ebac | |
Philip Hazel | 504ff06fff | |
Philip Hazel | 360a84e80b | |
Zoltan Herczeg | 061e57695a | |
Philip Hazel | 7f7d3e8521 | |
Philip Hazel | bf35c0518c | |
Zoltan Herczeg | 68fbc1982e | |
Philip Hazel | 06d3a66065 | |
Philip Hazel | 87571b5af3 | |
Philip Hazel | 838cdac4dc | |
Philip Hazel | 628a804102 | |
Philip Hazel | ec091e2e44 | |
Philip Hazel | 636569a957 | |
Philip Hazel | 81d3729c66 | |
Zoltan Herczeg | f90542a209 | |
Carlo Marcelo Arenas Belón | 14dbc6e6ec | |
Philip Hazel | 80205ee2a0 | |
Jessica Clarke | 04ecb267c0 | |
Jessica Clarke | 534b4760e3 | |
Philip Hazel | 31fb2e58a1 | |
Zoltan Herczeg | 435140a0ac | |
Philip Hazel | c24047f15d | |
Zoltan Herczeg | e7457003cd | |
Philip Hazel | d888d36013 | |
Zoltan Herczeg | 6614b281bc | |
Zoltan Herczeg | afa4756d19 | |
Philip Hazel | 7713f33e46 | |
Michael Kaufmann | af2637ee5e | |
Philip Hazel | 98e7d70bc6 | |
Philip Hazel | 321b559ed4 | |
Philip Hazel | 16c8a84cce | |
Philip Hazel | 4514ddd2a2 | |
Philip Hazel | 944f0e10a1 | |
Philip Hazel | b29732063b | |
Philip Hazel | 92d7cf1dd0 | |
Philip Hazel | 1d432ee3cf | |
Philip Hazel | 194a15315a | |
Philip Hazel | 1c41a5b815 | |
Zoltan Herczeg | 4243515033 | |
Philip Hazel | 49b29f837d | |
Philip Hazel | 30abd0ac8d | |
Philip Hazel | 0246c6bf64 | |
Philip Hazel | 823d4ac956 | |
Philip Hazel | ba3d0edcbd | |
Philip Hazel | 4ef0c51d2b | |
Philip Hazel | 7ab2769728 | |
Philip Hazel | 2a294ddadb | |
Philip Hazel | cb854a912e | |
Philip Hazel | 16dccbcb13 | |
Carlo Marcelo Arenas Belón | ae4e6261e5 | |
Carlo Marcelo Arenas Belón | d24a1c9d31 | |
Carlo Marcelo Arenas Belón | 055b7ce4a9 | |
Philip Hazel | 4a8f5d104c | |
Carlo Marcelo Arenas Belón | 587b94277b | |
Philip Hazel | c8d31f1605 | |
Carlo Marcelo Arenas Belón | adf76faace | |
Zoltan Herczeg | d144199dfb | |
Carlo Marcelo Arenas Belón | eb42305f07 | |
Philip Hazel | 46890604a4 | |
Carlo Marcelo Arenas Belón | acc520924c | |
Philip Hazel | bc70a183fc | |
Carlo Marcelo Arenas Belón | dae475092d | |
Philip Hazel | 1ed34b9cb1 | |
Philip Hazel | f19e84674e | |
Carlo Marcelo Arenas Belón | 7db8784296 | |
Philip Hazel | 072717a61f | |
Philip Hazel | 35fee4193b | |
Philip Hazel | 3469b13b8e | |
Philip Hazel | 29c37f9aa3 | |
Carlo Marcelo Arenas Belón | 128c50360c | |
Philip Hazel | bf2c8cc564 | |
Philip Hazel | 87f32b9b39 | |
Philip Hazel | 7ed39af7cc | |
Carlo Marcelo Arenas Belón | 3b973ebf4b | |
Carlo Marcelo Arenas Belón | f5e4e10042 | |
Carlo Marcelo Arenas Belón | d46f1863be | |
Philip Hazel | c99f0738c5 | |
Philip Hazel | 794470b51d | |
PhilipHazel | 179c5d212c | |
Lucas Trzesniewski | ec0755b829 | |
Philip Hazel | 8d9e91228c | |
PhilipHazel | e7af7efaa1 | |
Zoltan Herczeg | 51ec2c9893 | |
Philip Hazel | 0612ed77c2 | |
Philip Hazel | 507e4dcf6f | |
Zoltan Herczeg | dc5f966635 | |
Philip Hazel | 8f3e11a355 | |
Philip Hazel | e2fde18833 | |
Philip Hazel | 857ac92372 | |
Philip Hazel | 31a46200fa | |
Philip Hazel | edcc076bd8 | |
Philip Hazel | c232286c6b | |
Philip Hazel | 21c26698b3 | |
Philip Hazel | eea410b33a | |
Philip Hazel | d5a61ee891 | |
Philip Hazel | 6c2fe9da99 | |
Philip Hazel | 5ff1daffa0 | |
Philip Hazel | f4beac6c1a | |
Philip Hazel | e1cd61c292 | |
Philip Hazel | 6ee9921a89 | |
Philip Hazel | b8c60ce272 | |
Philip Hazel | b61aa572f6 | |
Philip Hazel | 25bb9de6fc | |
Philip Hazel | e74a9b6932 | |
PhilipHazel | 30036e670f | |
Philip Hazel | a8c4ef7f20 | |
Philip Hazel | c2fc6cfa0a | |
Philip Hazel | 587e46b372 | |
Philip Hazel | d8267c20fd | |
Philip Hazel | 15b692fd82 | |
Philip Hazel | 4ccef1697a | |
Philip Hazel | 5c0d38b3a8 | |
Philip Hazel | 23c16e6ced | |
Philip Hazel | 876ba431b0 | |
Philip Hazel | f64fbed2e1 | |
Philip.Hazel | 2410fbe386 | |
Philip.Hazel | d70da76dfb | |
Zoltán Herczeg | a5389db88d | |
Zoltán Herczeg | 3d80cf5a25 | |
Zoltán Herczeg | 900921f83e | |
Zoltán Herczeg | 1951243b5d | |
Philip.Hazel | 1c3256349f | |
Philip.Hazel | cd45050ee4 | |
Philip.Hazel | a5d81d06f4 | |
Philip.Hazel | 85fc061dcf | |
Philip.Hazel | 080d7789eb | |
Zoltán Herczeg | 38dbea6200 | |
Philip.Hazel | 8c1df186ab | |
Zoltán Herczeg | 0dd0283b17 | |
Zoltán Herczeg | 19a1319c0a | |
Philip.Hazel | 2c4d3942e4 | |
Zoltán Herczeg | b6acebe497 | |
Philip.Hazel | 25029849c3 | |
Philip.Hazel | 4cfa216898 | |
Philip.Hazel | 91485e5d5a | |
Philip.Hazel | 6cb388d55b | |
Philip.Hazel | 8144ae04e9 | |
Philip.Hazel | 166e576f91 | |
Philip.Hazel | c246f53ae1 | |
Zoltán Herczeg | e5e1fab2db | |
Zoltán Herczeg | b730793117 | |
Zoltán Herczeg | 46158a811f | |
Philip.Hazel | 027c9375c0 | |
Philip.Hazel | 7eb23f423e | |
Philip.Hazel | 6a9900c53b | |
Philip.Hazel | 9e15c97b6d | |
Zoltán Herczeg | d19789c251 | |
Philip.Hazel | 000bbf2ea7 | |
Philip.Hazel | dc426be88e | |
Zoltán Herczeg | fb54d81528 | |
Zoltán Herczeg | 2451870e3c | |
Zoltán Herczeg | 37b76d8609 | |
Philip.Hazel | 92554d19aa | |
Philip.Hazel | 6d4936dc29 | |
Philip.Hazel | fff544a1e9 | |
Philip.Hazel | deffc391ce | |
Philip.Hazel | 81da2b97e3 | |
Zoltán Herczeg | 3bdc76e4f3 | |
Philip.Hazel | f8cbb1f58d | |
Philip.Hazel | 0cf247f558 | |
Philip.Hazel | a2f0fd01c7 | |
Philip.Hazel | 5652d41209 | |
Zoltán Herczeg | 384620a172 | |
Zoltán Herczeg | 3d317692ac | |
Philip.Hazel | 0ad89ab06d | |
Philip.Hazel | ed489f99ae | |
Philip.Hazel | 3faff02596 | |
Philip.Hazel | cffe1ca463 | |
Philip.Hazel | b55dba885a | |
Zoltán Herczeg | fda3221597 | |
Zoltán Herczeg | 0652de5597 | |
Philip.Hazel | e44976f929 | |
Zoltán Herczeg | e0c6029a62 | |
Philip.Hazel | 5dfe817b5e | |
Philip.Hazel | e73119cbfa | |
Philip.Hazel | 768c7fe67e | |
Zoltán Herczeg | 018044a54e | |
Philip.Hazel | 9ff7f342f8 | |
Philip.Hazel | 56c4bf9095 | |
Philip.Hazel | bf4ca900f3 | |
Philip.Hazel | b940ed7520 | |
Philip.Hazel | d4e4533240 | |
Philip.Hazel | ce558bbff1 | |
Philip.Hazel | 5ec5c45423 | |
Philip.Hazel | ca55d0be6b | |
Philip.Hazel | 8b3f8af535 | |
Zoltán Herczeg | cf670e3bb9 | |
Philip.Hazel | 28f92c8596 | |
Philip.Hazel | 9cebee7e75 | |
Philip.Hazel | c472f3f91a | |
Philip.Hazel | 59233b8079 | |
Philip.Hazel | f988433788 | |
Philip.Hazel | 8057c3c8b9 | |
Zoltán Herczeg | 953d4e9c95 | |
Zoltán Herczeg | 0d0d954bbd | |
Zoltán Herczeg | 21c40e638b | |
Zoltán Herczeg | 106d9d3a25 | |
Zoltán Herczeg | 325908279e | |
Philip.Hazel | 3155a6951f | |
Zoltán Herczeg | 305e273e99 | |
Philip.Hazel | 68f9c49517 | |
Philip.Hazel | 3be538015b | |
Philip.Hazel | 4e8f13cbd6 | |
Philip.Hazel | f50ee03f5d | |
Zoltán Herczeg | a3057bbecd | |
Philip.Hazel | 4a7dfab0ec | |
Zoltán Herczeg | d0666136c9 | |
Zoltán Herczeg | c39fb3a9e1 | |
Zoltán Herczeg | c21bd97754 | |
Philip.Hazel | eedd9d8e55 | |
Philip.Hazel | a57787b7cd | |
Philip.Hazel | 29c0d64158 | |
Zoltán Herczeg | 697cf5f602 | |
Zoltán Herczeg | d71dc302a5 | |
Zoltán Herczeg | ed8a3146b9 | |
Philip.Hazel | e2c8dc8c2e | |
Philip.Hazel | b040e2e1cd | |
Philip.Hazel | 3a6b4948d1 | |
Philip.Hazel | 9e960f5465 | |
Philip.Hazel | f3c658cf87 | |
Philip.Hazel | 9e8c98587f | |
Zoltán Herczeg | 0a6ca6d420 | |
Zoltán Herczeg | 09984bb0e4 | |
Philip.Hazel | e8d70e2459 | |
Philip.Hazel | 7171d86587 | |
Zoltán Herczeg | bf4cd8212f | |
Philip.Hazel | 03720de840 | |
Philip.Hazel | 5ba5230b82 | |
Philip.Hazel | eaf4572ff8 | |
Philip.Hazel | 6707614863 | |
Philip.Hazel | 279128cbde | |
Philip.Hazel | f006fa5e3c | |
Philip.Hazel | ac4ab7186d | |
Philip.Hazel | d170829b26 | |
Philip.Hazel | 777582d4de | |
Philip.Hazel | f3fd8b18cb | |
Philip.Hazel | 0a2033f0f7 | |
Zoltán Herczeg | 880aac5dda | |
Zoltán Herczeg | 2632526c67 | |
Zoltán Herczeg | f5286d8f56 |
|
@ -0,0 +1,3 @@
|
|||
common --experimental_enable_bzlmod
|
||||
build --incompatible_enable_cc_toolchain_resolution
|
||||
build --incompatible_strict_action_env
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
name: Build
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Linux
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
alpine:
|
||||
name: alpine
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autotools
|
||||
run: apk add --no-cache automake autoconf gcc libtool make musl-dev
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
windows:
|
||||
name: 32bit Windows
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Configure
|
||||
run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
cd build\Debug
|
||||
..\..\RunTest.bat
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master ]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: [ master ]
|
||||
schedule:
|
||||
- cron: '27 6 * * 4'
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'cpp', 'python' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v1
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v1
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
||||
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
|
||||
# and modify them (or add more) to build your code if your project
|
||||
# uses a compiled language
|
||||
|
||||
#- run: |
|
||||
# make bootstrap
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v1
|
|
@ -0,0 +1,55 @@
|
|||
name: Scorecards supply-chain security
|
||||
on:
|
||||
# Only the default branch is supported.
|
||||
branch_protection_rule:
|
||||
schedule:
|
||||
- cron: '23 17 * * 1'
|
||||
push:
|
||||
branches: [ master ]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecards analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
actions: read
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# Read-only PAT token. To create it,
|
||||
# follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
|
||||
repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
|
||||
# Publish the results to enable scorecard badges. For more details, see
|
||||
# https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories, `publish_results` will automatically be set to `false`,
|
||||
# regardless of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional).
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -1,47 +1,82 @@
|
|||
INSTALL
|
||||
Makefile.in
|
||||
aclocal.m4
|
||||
ar-lib
|
||||
compile
|
||||
config.guess
|
||||
config.sub
|
||||
configure
|
||||
depcomp
|
||||
install-sh
|
||||
ltmain.sh
|
||||
m4/
|
||||
missing
|
||||
test-driver
|
||||
# Public .gitignore file for PCRE2
|
||||
|
||||
Makefile
|
||||
config.log
|
||||
config.status
|
||||
libpcre2-*.pc
|
||||
libtool
|
||||
pcre2-config
|
||||
src/.deps
|
||||
src/config.h
|
||||
src/pcre2.h
|
||||
src/stamp-h1
|
||||
|
||||
.libs
|
||||
*.o
|
||||
*.lo
|
||||
*.a
|
||||
*.lo
|
||||
*.la
|
||||
src/.dirstamp
|
||||
src/pcre2_chartables.c
|
||||
*.pc
|
||||
*.o
|
||||
*~
|
||||
*.lha
|
||||
|
||||
pcre2grep
|
||||
pcre2test
|
||||
pcre2_jit_test
|
||||
__pycache__
|
||||
.deps
|
||||
.libs
|
||||
|
||||
INSTALL
|
||||
Makefile
|
||||
Makefile.in
|
||||
RunGrepTest.log
|
||||
RunGrepTest.trs
|
||||
RunTest.log
|
||||
RunTest.trs
|
||||
|
||||
aclocal.m4
|
||||
ar-lib
|
||||
compile
|
||||
config.guess
|
||||
config.log
|
||||
config.status
|
||||
config.sub
|
||||
configure
|
||||
depcomp
|
||||
install-sh
|
||||
libtool
|
||||
ltmain.sh
|
||||
missing
|
||||
pcre2-config
|
||||
pcre2_dftables
|
||||
pcre2_jit_test
|
||||
pcre2_jit_test.log
|
||||
pcre2_jit_test.trs
|
||||
pcre2demo
|
||||
pcre2fuzzcheck
|
||||
pcre2grep
|
||||
pcre2test
|
||||
test-driver
|
||||
test-suite.log
|
||||
test3input
|
||||
test3output
|
||||
testNinput
|
||||
testNinputgrep
|
||||
teststderr
|
||||
teststderrM
|
||||
teststderrgrep
|
||||
teststdout
|
||||
teststdoutM
|
||||
testtemp1
|
||||
testtemp1grep
|
||||
testtemp2
|
||||
testtemp2grep
|
||||
testtry
|
||||
testtrygrep
|
||||
|
||||
m4/libtool.m4
|
||||
m4/ltoptions.m4
|
||||
m4/ltsugar.m4
|
||||
m4/ltversion.m4
|
||||
m4/lt~obsolete.m4
|
||||
|
||||
maint/ucptest
|
||||
maint/utf8
|
||||
|
||||
src/.deps
|
||||
src/.dirstamp
|
||||
src/config.h
|
||||
src/pcre2.h
|
||||
src/pcre2_chartables.c
|
||||
src/stamp-h1
|
||||
|
||||
/bazel-*
|
||||
|
||||
# End
|
||||
|
||||
*~
|
||||
|
|
12
AUTHORS
12
AUTHORS
|
@ -2,13 +2,13 @@ THE MAIN PCRE2 LIBRARY CODE
|
|||
---------------------------
|
||||
|
||||
Written by: Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2019 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
|
@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2019 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2019 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
####
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
|
||||
load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
|
||||
|
||||
copy_file(
|
||||
name = "config_h_generic",
|
||||
src = "src/config.h.generic",
|
||||
out = "src/config.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_h_generic",
|
||||
src = "src/pcre2.h.generic",
|
||||
out = "src/pcre2.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_chartables_c",
|
||||
src = "src/pcre2_chartables.c.dist",
|
||||
out = "src/pcre2_chartables.c",
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "pcre2",
|
||||
srcs = [
|
||||
"src/pcre2_auto_possess.c",
|
||||
"src/pcre2_compile.c",
|
||||
"src/pcre2_config.c",
|
||||
"src/pcre2_context.c",
|
||||
"src/pcre2_convert.c",
|
||||
"src/pcre2_dfa_match.c",
|
||||
"src/pcre2_error.c",
|
||||
"src/pcre2_extuni.c",
|
||||
"src/pcre2_find_bracket.c",
|
||||
"src/pcre2_maketables.c",
|
||||
"src/pcre2_match.c",
|
||||
"src/pcre2_match_data.c",
|
||||
"src/pcre2_newline.c",
|
||||
"src/pcre2_ord2utf.c",
|
||||
"src/pcre2_pattern_info.c",
|
||||
"src/pcre2_script_run.c",
|
||||
"src/pcre2_serialize.c",
|
||||
"src/pcre2_string_utils.c",
|
||||
"src/pcre2_study.c",
|
||||
"src/pcre2_substitute.c",
|
||||
"src/pcre2_substring.c",
|
||||
"src/pcre2_tables.c",
|
||||
"src/pcre2_ucd.c",
|
||||
"src/pcre2_ucptables.c",
|
||||
"src/pcre2_valid_utf.c",
|
||||
"src/pcre2_xclass.c",
|
||||
":pcre2_chartables_c",
|
||||
],
|
||||
hdrs = glob(["src/*.h"]) + [
|
||||
":config_h_generic",
|
||||
":pcre2_h_generic",
|
||||
],
|
||||
defines = [
|
||||
"HAVE_CONFIG_H",
|
||||
"PCRE2_CODE_UNIT_WIDTH=8",
|
||||
"PCRE2_STATIC",
|
||||
],
|
||||
includes = ["src"],
|
||||
strip_include_prefix = "src",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "pcre2demo",
|
||||
srcs = ["src/pcre2demo.c"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":pcre2"],
|
||||
)
|
578
CMakeLists.txt
578
CMakeLists.txt
|
@ -1,6 +1,5 @@
|
|||
# CMakeLists.txt
|
||||
#
|
||||
#
|
||||
# This file enables PCRE2 to be built with the CMake configuration and build
|
||||
# tool. Download CMake in source or binary form from http://www.cmake.org/
|
||||
# Converted to support PCRE2 from the original PCRE file, August 2014.
|
||||
|
@ -85,19 +84,44 @@
|
|||
# 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h
|
||||
# 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied
|
||||
# 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below)
|
||||
# 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere)
|
||||
# 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace
|
||||
# 2020-04-08 Carlo added function check for secure_getenv, fixed strerror
|
||||
# 2020-04-16 enh added check for __attribute__((uninitialized))
|
||||
# 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and
|
||||
# library versioning.
|
||||
# 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator
|
||||
# 2020-04-28 PH added function check for memfd_create based on Carlo's patch
|
||||
# 2020-05-25 PH added a check for Intel CET
|
||||
# 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel
|
||||
# 2021-06-29 JWSB added the option to build static library with PIC.
|
||||
# 2021-07-05 JWSB modified such that both the static and shared library can be
|
||||
# built in one go.
|
||||
# 2021-08-28 PH increased minimum version
|
||||
# 2021-08-28 PH added test for realpath()
|
||||
|
||||
PROJECT(PCRE2 C)
|
||||
|
||||
# Increased minimum to 2.8.0 to support newer add_test features.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
|
||||
# Increased minimum to 2.8.5 to support GNUInstallDirs.
|
||||
# Increased minimum to 3.1 to support imported targets.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.1)
|
||||
|
||||
# Set policy CMP0026 to avoid warnings for the use of LOCATION in
|
||||
# GET_TARGET_PROPERTY. This should no longer be required.
|
||||
# CMAKE_POLICY(SET CMP0026 OLD)
|
||||
|
||||
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
|
||||
# With a recent cmake, you can provide a rootdir to look for non
|
||||
# standard installed library dependencies, but to do so, the policy
|
||||
# needs to be set to new (by uncommenting the following)
|
||||
# CMAKE_POLICY(SET CMP0074 NEW)
|
||||
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src")
|
||||
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
|
||||
# on the command line.
|
||||
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
||||
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
||||
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
|
||||
|
||||
# external packages
|
||||
FIND_PACKAGE( BZip2 )
|
||||
|
@ -107,29 +131,66 @@ FIND_PACKAGE( Editline )
|
|||
|
||||
# Configuration checks
|
||||
|
||||
INCLUDE(CheckIncludeFile)
|
||||
INCLUDE(CheckCSourceCompiles)
|
||||
INCLUDE(CheckFunctionExists)
|
||||
INCLUDE(CheckSymbolExists)
|
||||
INCLUDE(CheckIncludeFile)
|
||||
INCLUDE(CheckTypeSize)
|
||||
INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR
|
||||
|
||||
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
|
||||
CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H)
|
||||
CHECK_INCLUDE_FILE(inttypes.h HAVE_INTTYPES_H)
|
||||
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
|
||||
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
|
||||
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
|
||||
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
|
||||
|
||||
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
|
||||
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
|
||||
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
|
||||
CHECK_SYMBOL_EXISTS(bcopy "strings.h" HAVE_BCOPY)
|
||||
CHECK_SYMBOL_EXISTS(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE)
|
||||
CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE)
|
||||
CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV)
|
||||
CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
|
||||
HAVE_REALPATH
|
||||
)
|
||||
|
||||
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"int main() { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }"
|
||||
HAVE_ATTRIBUTE_UNINITIALIZED
|
||||
)
|
||||
set(CMAKE_REQUIRED_FLAGS ${ORIG_CMAKE_REQUIRED_FLAGS})
|
||||
|
||||
# Check whether Intel CET is enabled, and if so, adjust compiler flags. This
|
||||
# code was written by PH, trying to imitate the logic from the autotools
|
||||
# configuration.
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#ifndef __CET__
|
||||
#error CET is not enabled
|
||||
#endif
|
||||
int main() { return 0; }"
|
||||
INTEL_CET_ENABLED
|
||||
)
|
||||
|
||||
IF (INTEL_CET_ENABLED)
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mshstk")
|
||||
ENDIF(INTEL_CET_ENABLED)
|
||||
|
||||
|
||||
|
||||
# User-configurable options
|
||||
#
|
||||
# Note: CMakeSetup displays these in alphabetical order, regardless of
|
||||
# the order we use here.
|
||||
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL
|
||||
"Build shared libraries instead of static ones.")
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries.")
|
||||
|
||||
OPTION(BUILD_STATIC_LIBS "Build static libraries." ON)
|
||||
|
||||
OPTION(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON)
|
||||
|
||||
|
@ -137,6 +198,8 @@ OPTION(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF)
|
|||
|
||||
OPTION(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF)
|
||||
|
||||
OPTION(PCRE2_STATIC_PIC "Build the static library with the option position independent code enabled." OFF)
|
||||
|
||||
OPTION(PCRE2_DEBUG "Include debugging code" OFF)
|
||||
|
||||
OPTION(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF)
|
||||
|
@ -177,8 +240,12 @@ SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL
|
|||
SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
|
||||
"Enable support for Just-in-time compiling.")
|
||||
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
|
||||
"Enable SELinux compatible execmem allocator in JIT (experimental).")
|
||||
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
|
||||
"Enable SELinux compatible execmem allocator in JIT (experimental).")
|
||||
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC IGNORE)
|
||||
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
|
||||
SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL
|
||||
"Enable use of Just-in-time compiling in pcre2grep.")
|
||||
|
@ -244,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
|
|||
IF(EDITLINE_FOUND)
|
||||
OPTION (PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
IF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ELSE(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
|
||||
" or set Editline_ROOT to a full libedit installed tree, as needed\n"
|
||||
" Might need to enable policy CMP0074 in CMakeLists.txt"
|
||||
)
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
|
||||
# readline lib
|
||||
IF(READLINE_FOUND)
|
||||
|
@ -258,9 +335,9 @@ ENDIF(PCRE2_SUPPORT_LIBREADLINE)
|
|||
|
||||
# Prepare build configuration
|
||||
|
||||
IF(NOT BUILD_SHARED_LIBS)
|
||||
SET(PCRE2_STATIC 1)
|
||||
ENDIF(NOT BUILD_SHARED_LIBS)
|
||||
IF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
|
||||
MESSAGE(FATAL_ERROR "At least one of BUILD_SHARED_LIBS or BUILD_STATIC_LIBS must be enabled.")
|
||||
ENDIF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
|
||||
|
||||
IF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32)
|
||||
MESSAGE(FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled")
|
||||
|
@ -284,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
|||
ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
|
||||
IF(READLINE_FOUND)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" Only one of the readline compatible libraries can be enabled.\n"
|
||||
" Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
|
||||
)
|
||||
ENDIF(READLINE_FOUND)
|
||||
ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
@ -300,11 +382,29 @@ IF(PCRE2_SUPPORT_UNICODE)
|
|||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT)
|
||||
SET(SUPPORT_JIT 1)
|
||||
SET(SUPPORT_JIT 1)
|
||||
IF(UNIX)
|
||||
FIND_PACKAGE(Threads REQUIRED)
|
||||
IF(CMAKE_USE_PTHREADS_INIT)
|
||||
SET(REQUIRE_PTHREAD 1)
|
||||
ENDIF(CMAKE_USE_PTHREADS_INIT)
|
||||
ENDIF(UNIX)
|
||||
ENDIF(PCRE2_SUPPORT_JIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
|
||||
SET(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
|
||||
CHECK_SYMBOL_EXISTS(mkostemp stdlib.h REQUIRED)
|
||||
UNSET(CMAKE_REQUIRED_DEFINITIONS)
|
||||
IF(${REQUIRED})
|
||||
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
ADD_DEFINITIONS(-D_GNU_SOURCE)
|
||||
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
|
||||
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
MESSAGE(FATAL_ERROR "Your configuration is not supported")
|
||||
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
ELSE(${REQUIRED})
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF)
|
||||
ENDIF(${REQUIRED})
|
||||
ENDIF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
|
||||
IF(PCRE2GREP_SUPPORT_JIT)
|
||||
|
@ -400,12 +500,13 @@ file(STRINGS ${PROJECT_SOURCE_DIR}/configure.ac
|
|||
LIMIT_COUNT 50 # Read only the first 50 lines of the file
|
||||
)
|
||||
|
||||
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date")
|
||||
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date"
|
||||
"libpcre2_posix_version" "libpcre2_8_version" "libpcre2_16_version" "libpcre2_32_version")
|
||||
foreach(configure_line ${configure_lines})
|
||||
foreach(_substitution_variable ${SEARCHED_VARIABLES})
|
||||
string(TOUPPER ${_substitution_variable} _substitution_variable_upper)
|
||||
if (NOT ${_substitution_variable_upper})
|
||||
string(REGEX MATCH "m4_define\\(${_substitution_variable}, \\[(.*)\\]" MACTHED_STRING ${configure_line})
|
||||
string(REGEX MATCH "m4_define\\(${_substitution_variable}, *\\[(.*)\\]" MATCHED_STRING ${configure_line})
|
||||
if (CMAKE_MATCH_1)
|
||||
set(${_substitution_variable_upper} ${CMAKE_MATCH_1})
|
||||
endif()
|
||||
|
@ -413,21 +514,83 @@ foreach(configure_line ${configure_lines})
|
|||
endforeach()
|
||||
endforeach()
|
||||
|
||||
# PARSE_LIB_VERSION(<prefix>)
# Splits the colon-separated value of <prefix>_VERSION into
# <prefix>_VERSION_CURRENT, <prefix>_VERSION_REVISION and <prefix>_VERSION_AGE
# (presumably the libtool current:revision:age triple read from configure.ac
# -- confirm), then derives <prefix>_SOVERSION and the two Mach-O version
# variables, and finally overwrites <prefix>_VERSION with
# "<soversion>.<age>.<revision>".
macro(PARSE_LIB_VERSION VARIABLE_PREFIX)
|
||||
string(REPLACE ":" ";" ${VARIABLE_PREFIX}_VERSION_LIST ${${VARIABLE_PREFIX}_VERSION})
|
||||
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 0 ${VARIABLE_PREFIX}_VERSION_CURRENT)
|
||||
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 1 ${VARIABLE_PREFIX}_VERSION_REVISION)
|
||||
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 2 ${VARIABLE_PREFIX}_VERSION_AGE)
|
||||
|
||||
math(EXPR ${VARIABLE_PREFIX}_SOVERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} - ${${VARIABLE_PREFIX}_VERSION_AGE}")
|
||||
math(EXPR ${VARIABLE_PREFIX}_MACHO_COMPATIBILITY_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
|
||||
math(EXPR ${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
|
||||
# Append ".<revision>" to the Mach-O current version. The string must end at
# the closing brace of the variable reference; a stray extra "}" here used to
# leak into the value (e.g. "12.0}").
set(${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION}.${${VARIABLE_PREFIX}_VERSION_REVISION}")
|
||||
set(${VARIABLE_PREFIX}_VERSION "${${VARIABLE_PREFIX}_SOVERSION}.${${VARIABLE_PREFIX}_VERSION_AGE}.${${VARIABLE_PREFIX}_VERSION_REVISION}")
|
||||
endmacro()
|
||||
|
||||
PARSE_LIB_VERSION(LIBPCRE2_POSIX)
|
||||
PARSE_LIB_VERSION(LIBPCRE2_8)
|
||||
PARSE_LIB_VERSION(LIBPCRE2_16)
|
||||
PARSE_LIB_VERSION(LIBPCRE2_32)
|
||||
|
||||
CONFIGURE_FILE(src/pcre2.h.in
|
||||
${PROJECT_BINARY_DIR}/pcre2.h
|
||||
@ONLY)
|
||||
|
||||
# What about pcre2-config and libpcre2.pc?
|
||||
# Make sure to not link debug libs
|
||||
# against release libs and vice versa
|
||||
IF(WIN32)
|
||||
SET(CMAKE_DEBUG_POSTFIX "d")
|
||||
ENDIF(WIN32)
|
||||
|
||||
# Generate pkg-config files
|
||||
|
||||
SET(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}")
|
||||
SET(prefix ${CMAKE_INSTALL_PREFIX})
|
||||
|
||||
SET(exec_prefix "\${prefix}")
|
||||
SET(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
|
||||
SET(includedir "\${prefix}/include")
|
||||
IF(WIN32 AND (CMAKE_BUILD_TYPE MATCHES Debug))
|
||||
SET(LIB_POSTFIX ${CMAKE_DEBUG_POSTFIX})
|
||||
ENDIF()
|
||||
CONFIGURE_FILE(libpcre2-posix.pc.in libpcre2-posix.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-posix.pc")
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_8)
|
||||
CONFIGURE_FILE(libpcre2-8.pc.in libpcre2-8.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-8.pc")
|
||||
SET(enable_pcre2_8 "yes")
|
||||
ELSE()
|
||||
SET(enable_pcre2_8 "no")
|
||||
ENDIF()
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
CONFIGURE_FILE(libpcre2-16.pc.in libpcre2-16.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-16.pc")
|
||||
SET(enable_pcre2_16 "yes")
|
||||
ELSE()
|
||||
SET(enable_pcre2_16 "no")
|
||||
ENDIF()
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
CONFIGURE_FILE(libpcre2-32.pc.in libpcre2-32.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-32.pc")
|
||||
SET(enable_pcre2_32 "yes")
|
||||
ELSE()
|
||||
SET(enable_pcre2_32 "no")
|
||||
ENDIF()
|
||||
|
||||
CONFIGURE_FILE(pcre2-config.in pcre2-config @ONLY)
|
||||
|
||||
# Character table generation
|
||||
|
||||
OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)
|
||||
IF(PCRE2_REBUILD_CHARTABLES)
|
||||
ADD_EXECUTABLE(dftables src/dftables.c)
|
||||
ADD_EXECUTABLE(pcre2_dftables src/pcre2_dftables.c)
|
||||
ADD_CUSTOM_COMMAND(
|
||||
COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
|
||||
DEPENDS dftables
|
||||
COMMAND dftables
|
||||
DEPENDS pcre2_dftables
|
||||
COMMAND pcre2_dftables
|
||||
ARGS ${PROJECT_BINARY_DIR}/pcre2_chartables.c
|
||||
OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c
|
||||
)
|
||||
|
@ -474,39 +637,37 @@ SET(PCRE2_SOURCES
|
|||
SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
|
||||
SET(PCRE2POSIX_SOURCES src/pcre2posix.c)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2.rc pcre2.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2 coff info in mingw build)
|
||||
SET(PCRE2_SOURCES
|
||||
${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2posix.rc pcre2posix.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2posix coff info in mingw build)
|
||||
SET(PCRE2POSIX_SOURCES
|
||||
${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MINGW AND BUILD_SHARED_LIBS)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2.rc pcre2.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2 coff info in mingw build)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
||||
IF(MSVC AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES
|
||||
${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
SET(PCRE2POSIX_SOURCES
|
||||
${PCRE2POSIX_SOURCES} pcre2posix.rc)
|
||||
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MSVC AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2posix.rc pcre2posix.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2posix coff info in mingw build)
|
||||
SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC AND BUILD_SHARED_LIBS)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} pcre2posix.rc)
|
||||
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MSVC AND BUILD_SHARED_LIBS)
|
||||
|
||||
# Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681
|
||||
# This code was taken from the CMake wiki, not from WebM.
|
||||
|
@ -529,71 +690,219 @@ IF(MSVC)
|
|||
ENDIF(MSVC)
|
||||
|
||||
SET(CMAKE_INCLUDE_CURRENT_DIR 1)
|
||||
# needed to make sure to not link debug libs
|
||||
# against release libs and vice versa
|
||||
IF(WIN32)
|
||||
SET(CMAKE_DEBUG_POSTFIX "d")
|
||||
ENDIF(WIN32)
|
||||
|
||||
SET(targets)
|
||||
|
||||
# 8-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_8)
|
||||
ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_PROPERTY(TARGET pcre2-8
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
|
||||
SET(targets ${targets} pcre2-8)
|
||||
ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_PROPERTY(TARGET pcre2-posix
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
|
||||
SET(targets ${targets} pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET(targets ${targets} pcre2-posix-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8-static)
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8)
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static pcre2-posix-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION}
|
||||
OUTPUT_NAME pcre2-8)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
|
||||
OUTPUT_NAME pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
|
||||
SET(targets ${targets} pcre2-posix-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
# 16-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_PROPERTY(TARGET pcre2-16
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16)
|
||||
SET(targets ${targets} pcre2-16)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
# Version properties for the 16-bit static library. All values come from the
# LIBPCRE2_16_* variables produced by PARSE_LIB_VERSION(LIBPCRE2_16); the
# Mach-O versions previously used the LIBPCRE2_32_* variables by mistake.
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
# Version properties for the 16-bit shared library. All values come from the
# LIBPCRE2_16_* variables produced by PARSE_LIB_VERSION(LIBPCRE2_16); the
# Mach-O versions previously used the LIBPCRE2_32_* variables by mistake.
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION}
|
||||
OUTPUT_NAME pcre2-16)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_16)
|
||||
|
||||
# 32-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_PROPERTY(TARGET pcre2-32
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32)
|
||||
SET(targets ${targets} pcre2-32)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION}
|
||||
OUTPUT_NAME pcre2-32)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_32)
|
||||
|
||||
# Executables
|
||||
|
@ -718,7 +1027,9 @@ if test \"$?\" != \"0\"; then exit 1; fi
|
|||
\@echo off
|
||||
setlocal
|
||||
SET srcdir=\"${winsrc}\"
|
||||
SET pcre2test=\"${winexe}\"
|
||||
# The next line was replaced by the following one after a user comment.
|
||||
# SET pcre2test=\"${winexe}\"
|
||||
SET pcre2test=\"${winbin}\\pcre2test.exe\"
|
||||
if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2test=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2test.exe\"
|
||||
call %srcdir%\\RunTest.Bat
|
||||
if errorlevel 1 exit /b 1
|
||||
|
@ -754,42 +1065,44 @@ SET(CMAKE_INSTALL_ALWAYS 1)
|
|||
|
||||
INSTALL(TARGETS ${targets}
|
||||
RUNTIME DESTINATION bin
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib)
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
INSTALL(FILES ${pkg_config_files} DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"
|
||||
DESTINATION bin
|
||||
# Set 0755 permissions
|
||||
PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
|
||||
|
||||
INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)
|
||||
|
||||
# CMake config files.
|
||||
set(PCRE2_CONFIG_IN ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config.cmake.in)
|
||||
set(PCRE2_CONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config.cmake)
|
||||
configure_file(${PCRE2_CONFIG_IN} ${PCRE2_CONFIG_OUT} @ONLY)
|
||||
set(PCRE2_CONFIG_VERSION_IN ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config-version.cmake.in)
|
||||
set(PCRE2_CONFIG_VERSION_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config-version.cmake)
|
||||
configure_file(${PCRE2_CONFIG_VERSION_IN} ${PCRE2_CONFIG_VERSION_OUT} @ONLY)
|
||||
install(FILES ${PCRE2_CONFIG_OUT} ${PCRE2_CONFIG_VERSION_OUT} DESTINATION cmake)
|
||||
|
||||
FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
|
||||
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
|
||||
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
|
||||
|
||||
FOREACH(man ${man3})
|
||||
GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
|
||||
SET(man3_new ${man3} ${man})
|
||||
ENDFOREACH(man ${man3})
|
||||
SET(man3 ${man3_new})
|
||||
|
||||
INSTALL(FILES ${man1} DESTINATION man/man1)
|
||||
INSTALL(FILES ${man3} DESTINATION man/man3)
|
||||
INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)
|
||||
|
||||
IF(MSVC AND INSTALL_MSVC_PDB)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posix.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posixd.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS Debug)
|
||||
INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
|
||||
ENDIF(MSVC AND INSTALL_MSVC_PDB)
|
||||
|
||||
# Help, only for nice output
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ELSE(BUILD_SHARED_LIBS)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
SET(BUILD_STATIC_LIBS ON)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(PCRE2_HEAP_MATCH_RECURSE)
|
||||
MESSAGE(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.")
|
||||
|
@ -802,7 +1115,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
ENDIF(CMAKE_C_FLAGS)
|
||||
MESSAGE(STATUS "")
|
||||
MESSAGE(STATUS "")
|
||||
MESSAGE(STATUS "PCRE2 configuration summary:")
|
||||
MESSAGE(STATUS "PCRE2-${PCRE2_MAJOR}.${PCRE2_MINOR} configuration summary:")
|
||||
MESSAGE(STATUS "")
|
||||
MESSAGE(STATUS " Install prefix .................. : ${CMAKE_INSTALL_PREFIX}")
|
||||
MESSAGE(STATUS " C compiler ...................... : ${CMAKE_C_COMPILER}")
|
||||
|
@ -827,6 +1140,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}")
|
||||
MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")
|
||||
MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
|
||||
MESSAGE(STATUS " with PIC enabled ............. : ${PCRE2_STATIC_PIC}")
|
||||
MESSAGE(STATUS " Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
|
||||
MESSAGE(STATUS " Enable JIT in pcre2grep ......... : ${PCRE2GREP_SUPPORT_JIT}")
|
||||
MESSAGE(STATUS " Enable callouts in pcre2grep .... : ${PCRE2GREP_SUPPORT_CALLOUT}")
|
||||
|
@ -861,10 +1175,10 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Use %zu and %td ..................: AUTO" )
|
||||
ENDIF(PCRE2_DISABLE_PERCENT_ZT)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MINGW AND BUILD_SHARED_LIBS)
|
||||
MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
|
||||
MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC)
|
||||
MESSAGE(STATUS " Install MSVC .pdb files ..........: ${INSTALL_MSVC_PDB}")
|
||||
|
|
563
ChangeLog
563
ChangeLog
|
@ -1,5 +1,562 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
Change Log for PCRE2 - see also the Git log
|
||||
-------------------------------------------
|
||||
|
||||
|
||||
Version 10.41 xx-xxx-2022
|
||||
-------------------------
|
||||
|
||||
1. Add fflush() before and after a fork callout in pcre2grep to get its output
|
||||
to be the same on all systems. (There were previously ordering differences in
|
||||
Alpine Linux).
|
||||
|
||||
2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
|
||||
|
||||
3. SSF scorecards grumbled about possible overflow in an expression in
|
||||
pcre2test. It never would have overflowed in practice, but some casts have been
|
||||
added and at the same time there's been some tidying of fprints that output
|
||||
size_t values.
|
||||
|
||||
4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
|
||||
|
||||
5. Minor code re-arrangement to remove gcc warning about realloc() in
|
||||
pcre2test.
|
||||
|
||||
6. Change a number of int variables that hold buffer and line lengths in
|
||||
pcre2grep to PCRE2_SIZE (aka size_t).
|
||||
|
||||
7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
|
||||
supported (even though that function would do nothing in that case) at the
|
||||
request of a user who doesn't even want to link with pcre_jit_compile.o. Also
|
||||
tidied up an untidy #ifdef arrangement in pcre2test.
|
||||
|
||||
8. Fixed an issue in the backtracking optimization of character repeats in
|
||||
JIT. Furthermore optimize star repetitions, not just plus repetitions.
|
||||
|
||||
9. Removed the use of an initial backtracking frames vector on the system stack
|
||||
in pcre2_match() so that it now always uses the heap. (In a multi-thread
|
||||
environment with very small stacks there had been an issue.) This also is
|
||||
tidier for JIT matching, which didn't need that vector. The heap vector is now
|
||||
remembered in the match data block and re-used if that block itself is re-used.
|
||||
It is freed with the match data block.
|
||||
|
||||
10. Adjusted the find_limits code in pcre2test to work with change 9 above.
|
||||
|
||||
11. Added find_limits_noheap to pcre2test, because the heap limits are now
|
||||
different in different environments and so cannot be included in the standard
|
||||
tests.
|
||||
|
||||
12. Created a test for pcre2_match() heap processing that is not part of the
|
||||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
|
||||
handling of multiple passes.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
|
||||
in pcre2grep with buffered fseek(stdin).
|
||||
|
||||
3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
|
||||
not supported.
|
||||
|
||||
4. Revert an unintended change in JIT repeat detection.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
|
||||
|
||||
6. Merged documentation and comments patches from @carenas (GitHub #47).
|
||||
|
||||
7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
|
||||
from pcre2grep.
|
||||
|
||||
8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
|
||||
|
||||
9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
|
||||
substituting.
|
||||
|
||||
10. Add null_subject and null_replacement modifiers to pcre2test.
|
||||
|
||||
11. Add check for NULL subject to POSIX regexec() function.
|
||||
|
||||
12. Add check for NULL replacement to pcre2_substitute().
|
||||
|
||||
13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
|
||||
pcre2_substitute(), and the replacement argument of the latter, if the pointer
|
||||
is NULL and the length is zero, treat as an empty string. Apparently a number
|
||||
of applications treat NULL/0 in this way.
|
||||
|
||||
14. Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
15. Fix some minor issues raised by clang sanitize.
|
||||
|
||||
16. Very minor code speed up for maximizing character property matches.
|
||||
|
||||
17. A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
18. The Python scripts in the maint directory have been refactored. There are
|
||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||
duplicated are now held in a single common Python module.
|
||||
|
||||
19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
|
||||
hardware capabilities, which consist of both an integer address and additional
|
||||
metadata, meaning they are twice the size of the platform's size_t type, i.e.
|
||||
16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
|
||||
8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
|
||||
not 16. Whilst the first frame was always suitably aligned, this then
|
||||
misaligned the frame that follows, resulting in an alignment fault when storing
|
||||
a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
|
||||
Clarke PR#72.
|
||||
|
||||
20. Added -LP and -LS listing options to pcre2test.
|
||||
|
||||
21. A user discovered that the library names in CMakeLists.txt for MSVC
|
||||
debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
|
||||
|
||||
22. An item such as [Aa] is optimized into a caseless single character match.
|
||||
When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
|
||||
pattern, the optimizing "must be present for a match" character check was not
|
||||
being flagged as caseless, causing some matches that should have succeeded to
|
||||
fail.
|
||||
|
||||
23. Fixed a unicode property matching issue in JIT. The character was not
|
||||
fully read in caseless matching.
|
||||
|
||||
24. Fixed an issue affecting recursions in JIT caused by duplicated data
|
||||
transfers.
|
||||
|
||||
25. Merged patch from @carenas (GitHub #96) which fixes some problems with
|
||||
pcre2test and readline/readedit:
|
||||
|
||||
* Use the right header for libedit in FreeBSD with autoconf
|
||||
* Really allow libedit with cmake
|
||||
* Avoid using readline headers with libedit
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #28):
|
||||
|
||||
Visual Studio 2013 includes support for %zu and %td, so let newer
|
||||
versions of it avoid the fallback, and while at it, make sure that
|
||||
the first check is for DISABLE_PERCENT_ZT so it will be always
|
||||
honoured if chosen.
|
||||
|
||||
ptrdiff_t is signed, so use a signed type instead, and make sure
|
||||
that an appropriate width is chosen if pointers are 64bit wide and
|
||||
long is not (ex: Windows 64bit).
|
||||
|
||||
IMHO removing the cast (and therefore the possibility of truncation)
|
||||
makes the code cleaner and the fallback is likely portable enough
|
||||
with all 64-bit POSIX systems doing LP64 except for Windows.
|
||||
|
||||
3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0.
|
||||
|
||||
4. Merged patch from @carenas (GitHub #30):
|
||||
|
||||
* Cleanup: remove references to no longer used stdint.h
|
||||
|
||||
Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h
|
||||
(simplification) and remove the now unnecessary inclusion in
|
||||
pcre2_internal.h., 2018-11-14), stdint.h is no longer used.
|
||||
|
||||
Remove checks for it in autotools and CMake and document better the expected
|
||||
build failures for systems that might have stdint.h (C99) and not inttypes.h
|
||||
(from POSIX), like old Windows.
|
||||
|
||||
* Cleanup: remove detection for inttypes.h which is a hard dependency
|
||||
|
||||
CMake checks for standard headers are not meant to be used for hard
|
||||
dependencies, so will prevent a possible fallback to work.
|
||||
|
||||
Alternatively, the header could be checked to make the configuration fail
|
||||
instead of breaking the build, but that was punted, as it was missing anyway
|
||||
from autotools.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #32):
|
||||
|
||||
* jit: allow building with ancient MSVC versions
|
||||
|
||||
Visual Studio older than 2013 fails to build with JIT enabled, because it is
|
||||
unable to parse non C89 compatible syntax, with mixed declarations and code.
|
||||
While most recent compilers wouldn't even report this as a warning since it
|
||||
is valid C99, it could be also made visible by adding to gcc/clang the
|
||||
-Wdeclaration-after-statement flag at build time.
|
||||
|
||||
Move the code below the affected definitions.
|
||||
|
||||
* pcre2grep: avoid mixing declarations with code
|
||||
|
||||
Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep,
|
||||
2021-08-28), code will fail to build in a strict C89 compiler.
|
||||
|
||||
Reformat slightly to make it C89 compatible again.
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix invalid single character repetition issues in JIT when the repetition
|
||||
is inside a capturing bracket and the bracket is preceded by character
|
||||
literals.
|
||||
|
||||
2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
|
||||
This extends the CMake build system to build both static and shared libraries
|
||||
in one go, builds the static library with PIC, and exposes PCRE2 libraries
|
||||
using the CMake config files. JWB provided these notes:
|
||||
|
||||
- Introduced CMake variable BUILD_STATIC_LIBS to build the static library.
|
||||
|
||||
- Make a small modification to config-cmake.h.in by removing the PCRE2_STATIC
|
||||
variable. Added PCRE2_STATIC variable to the static build using the
|
||||
target_compile_definitions() function.
|
||||
|
||||
- Extended the CMake config files.
|
||||
|
||||
- Introduced CMake variable PCRE2_USE_STATIC_LIBS to easily switch between
|
||||
the static and shared libraries.
|
||||
|
||||
- Added the PCRE2_STATIC variable to the target compile definitions for the
|
||||
import of the static library.
|
||||
|
||||
Building static and shared libraries using MSVC results in a name clash of
|
||||
the libraries. Both static and shared library builds create, for example, the
|
||||
file pcre2-8.lib. Therefore, I decided to change the static library names by
|
||||
adding "-static". For example, pcre2-8.lib has become pcre2-8-static.lib.
|
||||
[Comment by PH: this is MSVC-specific. It doesn't happen on Linux.]
|
||||
|
||||
3. Increased the minimum release number for CMake to 3.0.0 because older than
|
||||
2.8.12 is deprecated (it was set to 2.8.5) and causes warnings. Even 3.0.0 is
|
||||
quite old; it was released in 2014.
|
||||
|
||||
4. Implemented a modified version of Thomas Tempelmann's pcre2grep patch for
|
||||
detecting symlink loops. This is dependent on the availability of realpath(),
|
||||
which is now tested for in ./configure and CMakeLists.txt.
|
||||
|
||||
5. Implemented a modified version of Thomas Tempelmann's patch for faster
|
||||
case-independent "first code unit" searches for unanchored patterns in 8-bit
|
||||
mode in the interpreters. Instead of just remembering whether one case matched
|
||||
or not, it remembers the position of a previous match so as to avoid
|
||||
unnecessary repeated searching.
|
||||
|
||||
6. Perl now locks out \K in lookarounds, so PCRE2 now does the same by default.
|
||||
However, just in case anybody was relying on the old behaviour, there is an
|
||||
option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour.
|
||||
An option has also been added to pcre2grep to enable this.
|
||||
|
||||
7. Re-enable a JIT optimization which was unintentionally disabled in 10.35.
|
||||
|
||||
8. There is a loop counter to catch excessively crazy patterns when checking
|
||||
the lengths of lookbehinds at compile time. This was incorrectly getting reset
|
||||
whenever a lookahead was processed, leading to some fuzzer-generated patterns
|
||||
taking a very long time to compile when (?|) was present in the pattern,
|
||||
because (?|) disables caching of group lengths.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
-------------------------
|
||||
|
||||
1. Change RunGrepTest to use tr instead of sed when testing with binary
|
||||
zero bytes, because sed varies a lot from system to system and has problems
|
||||
with binary zeros. This is from Bugzilla #2681. Patch from Jeremie
|
||||
Courreges-Anglas via Nam Nguyen. This fixes RunGrepTest for OpenBSD. Later:
|
||||
it broke it for at least one version of Solaris, where tr can't handle binary
|
||||
zeros. However, that system had /usr/xpg4/bin/tr installed, which works OK, so
|
||||
RunGrepTest now checks for that command and uses it if found.
|
||||
|
||||
2. Compiling with gcc 10.2's -fanalyzer option showed up a hypothetical problem
|
||||
with a NULL dereference. I don't think this case could ever occur in practice,
|
||||
but I have put in a check in order to get rid of the compiler error.
|
||||
|
||||
3. An alternative patch for CMakeLists.txt because 10.36 #4 breaks CMake on
|
||||
Windows. Patch from email@cs-ware.de fixes bugzilla #2688.
|
||||
|
||||
4. Two bugs related to over-large numbers have been fixed so the behaviour is
|
||||
now the same as Perl.
|
||||
|
||||
(a) A pattern such as /\214748364/ gave an overflow error instead of being
|
||||
treated as the octal number \214 followed by literal digits.
|
||||
|
||||
(b) A sequence such as {65536 that has no terminating } so is not a
|
||||
quantifier was nevertheless complaining that a quantifier number was too big.
|
||||
|
||||
5. A run of autoconf suggested that configure.ac was out-of-date with respect
|
||||
to the latest autoconf. Running autoupdate made some valid changes, some valid
|
||||
suggestions, and also some invalid changes, which were fixed by hand. Autoconf
|
||||
now runs clean and the resulting "configure" seems to work, so I hope nothing
|
||||
is broken. Later: the requirement for autoconf 2.70 broke some automatic test
|
||||
robots. It doesn't seem to be necessary: trying a reduction to 2.60.
|
||||
|
||||
6. The pattern /a\K.(?0)*/ when matched against "abac" by the interpreter gave
|
||||
the answer "bac", whereas Perl and JIT both yield "c". This was because the
|
||||
effect of \K was not propagating back from the full pattern recursion. Other
|
||||
recursions such as /(a\K.(?1)*)/ did not have this problem.
|
||||
|
||||
7. Restore single character repetition optimization in JIT. Currently fewer
|
||||
character repetitions are optimized than in 10.34.
|
||||
|
||||
8. When the names of the functions in the POSIX wrapper were changed to
|
||||
pcre2_regcomp() etc. (see change 10.33 #4 below), functions with the original
|
||||
names were left in the library so that pre-compiled programs would still work.
|
||||
However, this has proved troublesome when programs link with several libraries,
|
||||
some of which use PCRE2 via the POSIX interface while others use a native POSIX
|
||||
library. For this reason, the POSIX function names are removed in this release.
|
||||
The macros in pcre2posix.h should ensure that re-compiling fixes any programs
|
||||
that haven't been compiled since before 10.33.
|
||||
|
||||
|
||||
Version 10.36 04-December-2020
|
||||
------------------------------
|
||||
|
||||
1. Add CET_CFLAGS so that when Intel CET is enabled, pass -mshstk to
|
||||
compiler. This fixes https://bugs.exim.org/show_bug.cgi?id=2578. Patch for
|
||||
Makefile.am and configure.ac by H.J. Lu. Equivalent patch for CMakeLists.txt
|
||||
invented by PH.
|
||||
|
||||
2. Fix infinite loop when a single byte newline is searched in JIT when
|
||||
invalid utf8 mode is enabled.
|
||||
|
||||
3. Updated CMakeLists.txt with patch from Wolfgang Stöggl (Bugzilla #2584):
|
||||
|
||||
- Include GNUInstallDirs and use ${CMAKE_INSTALL_LIBDIR} instead of hardcoded
|
||||
lib. This allows differentiation between lib and lib64.
|
||||
CMAKE_INSTALL_LIBDIR is used for installation of libraries and also for
|
||||
pkgconfig file generation.
|
||||
|
||||
- Add the version of PCRE2 to the configuration summary like ./configure
|
||||
does.
|
||||
|
||||
- Fix typo: MACTHED_STRING->MATCHED_STRING
|
||||
|
||||
4. Updated CMakeLists.txt with another patch from Wolfgang Stöggl (Bugzilla
|
||||
#2588):
|
||||
|
||||
- Add escaped double quotes around include directory in CMakeLists.txt to
|
||||
allow spaces in directory names.
|
||||
|
||||
- This fixes a cmake error, if the path of the pcre2 source contains a space.
|
||||
|
||||
5. Updated CMakeLists.txt with a patch from B. Scott Michel: CMake's
|
||||
documentation suggests using CHECK_SYMBOL_EXISTS over CHECK_FUNCTION_EXISTS.
|
||||
Moreover, these functions come from specific header files, which need to be
|
||||
specified (and, thankfully, are the same on both the Linux and WinXX
|
||||
platforms.)
|
||||
|
||||
6. Added a (uint32_t) cast to prevent a compiler warning in pcre2_compile.c.
|
||||
|
||||
7. Applied a patch from Wolfgang Stöggl (Bugzilla #2600) to fix postfix for
|
||||
debug Windows builds using CMake. This also updated configure so that it
|
||||
generates *.pc files and pcre2-config with the same content, as in the past.
|
||||
|
||||
8. If a pattern ended with (?(VERSION=n.d where n is any number but d is just a
|
||||
single digit, the code unit beyond d was being read (i.e. there was a read
|
||||
buffer overflow). Fixes ClusterFuzz 23779.
|
||||
|
||||
9. After the rework in r1235, certain character ranges were incorrectly
|
||||
handled by an optimization in JIT. Furthermore a wrong offset was used to
|
||||
read a value from a buffer which could lead to memory overread.
|
||||
|
||||
10. Unnoticed for many years was the fact that delimiters other than / in the
|
||||
testinput1 and testinput4 files could cause incorrect behaviour when these
|
||||
files were processed by perltest.sh. There were several tests that used quotes
|
||||
as delimiters, and it was just luck that they didn't go wrong with perltest.sh.
|
||||
All the patterns in testinput1 and testinput4 now use / as their delimiter.
|
||||
This fixes Bugzilla #2641.
|
||||
|
||||
11. Perl has started to give an error for \K within lookarounds (though there
|
||||
are cases where it doesn't). PCRE2 still allows this, so the tests that include
|
||||
this case have been moved from test 1 to test 2.
|
||||
|
||||
12. Further to 10 above, pcre2test has been updated to detect and grumble if a
|
||||
delimiter other than / is used after #perltest.
|
||||
|
||||
13. Fixed a bug with PCRE2_MATCH_INVALID_UTF in 8-bit mode when PCRE2_CASELESS
|
||||
was set and PCRE2_NO_START_OPTIMIZE was not set. The optimization for finding
|
||||
the start of a match was not resetting correctly after a failed match on the
|
||||
first valid fragment of the subject, possibly causing incorrect "no match"
|
||||
returns on subsequent fragments. For example, the pattern /A/ failed to match
|
||||
the subject \xe5A. Fixes Bugzilla #2642.
|
||||
|
||||
14. Fixed a bug in character set matching when JIT is enabled and both unicode
|
||||
scripts and unicode classes are present at the same time.
|
||||
|
||||
15. Added GNU grep's -m (aka --max-count) option to pcre2grep.
|
||||
|
||||
16. Refactored substitution processing in pcre2grep strings, both for the -O
|
||||
option and when dealing with callouts. There is now a single function that
|
||||
handles $ expansion in all cases (instead of multiple copies of almost
|
||||
identical code). This means that the same escape sequences are available
|
||||
everywhere, which was not previously the case. At the same time, the escape
|
||||
sequences $x{...} and $o{...} have been introduced, to allow for characters
|
||||
whose code points are greater than 255 in Unicode mode.
|
||||
|
||||
17. Applied the patch from Bugzilla #2628 to RunGrepTest. This does an explicit
|
||||
test for a version of sed that can handle binary zero, instead of assuming that
|
||||
any Linux version will work. Later: replaced $(...) by `...` because not all
|
||||
shells recognize the former.
|
||||
|
||||
18. Fixed a word boundary check bug in JIT when partial matching is enabled.
|
||||
|
||||
19. Fix ARM64 compilation warning in JIT. Patch by Carlo.
|
||||
|
||||
20. A bug in the RunTest script meant that if the first part of test 2 failed,
|
||||
the failure was not reported.
|
||||
|
||||
21. Test 2 was failing when run from a directory other than the source
|
||||
directory. This failure was previously missed in RunTest because of 20 above.
|
||||
Fixes added to both RunTest and RunTest.bat.
|
||||
|
||||
22. Patch to CMakeLists.txt from Daniel to fix problem with testing under
|
||||
Windows.
|
||||
|
||||
|
||||
Version 10.35 09-May-2020
|
||||
---------------------------
|
||||
|
||||
1. Use PCRE2_MATCH_EMPTY flag to detect empty matches in JIT.
|
||||
|
||||
2. Fix ARMv5 JIT improper handling of labels right after a constant pool.
|
||||
|
||||
3. A JIT bug is fixed which allowed reading the fields of the compiled
|
||||
pattern before its existence is checked.
|
||||
|
||||
4. Back in the PCRE1 day, capturing groups that contained recursive back
|
||||
references to themselves were made atomic (version 8.01, change 18) because
|
||||
after the end of a repeated group, the captured substrings had their values from
|
||||
the final repetition, not from an earlier repetition that might be the
|
||||
destination of a backtrack. This feature was documented, and was carried over
|
||||
into PCRE2. However, it has now been realized that the major refactoring that
|
||||
was done for 10.30 has made this atomicizing unnecessary, and it is confusing
|
||||
when users are unaware of it, making some patterns appear not to be working as
|
||||
expected. Capture values of recursive back references in repeated groups are
|
||||
now correctly backtracked, so this unnecessary restriction has been removed.
|
||||
|
||||
5. Added PCRE2_SUBSTITUTE_LITERAL.
|
||||
|
||||
6. Avoid some VS compiler warnings.
|
||||
|
||||
7. Added PCRE2_SUBSTITUTE_MATCHED.
|
||||
|
||||
8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
|
||||
regex engine. The Perl regex folks are aware of this usage and have made a note
|
||||
about it.
|
||||
|
||||
9. When an assertion is repeated, PCRE2 used to limit the maximum repetition to
|
||||
1, believing that repeating an assertion is pointless. However, if a positive
|
||||
assertion contains capturing groups, repetition can be useful. In any case, an
|
||||
assertion could always be wrapped in a repeated group. The only restriction
|
||||
that is now imposed is that an unlimited maximum is changed to one more than
|
||||
the minimum.
|
||||
|
||||
10. Fix *THEN verbs in lookahead assertions in JIT.
|
||||
|
||||
11. Added PCRE2_SUBSTITUTE_REPLACEMENT_ONLY.
|
||||
|
||||
12. The JIT stack should be freed when the low-level stack allocation fails.
|
||||
|
||||
13. In pcre2grep, if the final line in a scanned file is output but does not
|
||||
end with a newline sequence, add a newline according to the --newline setting.
|
||||
|
||||
14. (?(DEFINE)...) groups were not being handled correctly when checking for
|
||||
the fixed length of a lookbehind assertion. Such a group within a lookbehind
|
||||
should be skipped, as it does not contribute to the length of the group.
|
||||
Instead, the (DEFINE) group was being processed, and if at the end of the
|
||||
lookbehind, that end was not correctly recognized. Errors such as "lookbehind
|
||||
assertion is not fixed length" and also "internal error: bad code value in
|
||||
parsed_skip()" could result.
|
||||
|
||||
15. Put a limit of 1000 on recursive calls in pcre2_study() when searching
|
||||
nested groups for starting code units, in order to avoid stack overflow issues.
|
||||
If the limit is reached, it just gives up trying for this optimization.
|
||||
|
||||
16. The control verb chain list must always be restored when exiting from a
|
||||
recurse function in JIT.
|
||||
|
||||
17. Fix a crash which occurs when the character type of an invalid UTF
|
||||
character is decoded in JIT.
|
||||
|
||||
18. Changes in many areas of the code so that when Unicode is supported and
|
||||
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
|
||||
upper/lower case computations on characters whose code points are greater than
|
||||
127.
|
||||
|
||||
19. The function for checking UTF-16 validity was returning an incorrect offset
|
||||
for the start of the error when a high surrogate was not followed by a valid
|
||||
low surrogate. This caused incorrect behaviour, for example when
|
||||
PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the
|
||||
invalid high surrogate, such as /aa/ matching "\x{d800}aa".
|
||||
|
||||
20. If a DEFINE group immediately preceded a lookbehind assertion, the pattern
|
||||
could be mis-compiled and therefore not match correctly. This is the example
|
||||
that found this: /(?(DEFINE)(?<foo>bar))(?<![-a-z0-9])word/ which failed to
|
||||
match "word" because the "move back" value was set to zero.
|
||||
|
||||
21. Following a request from a user, some extensions and tidies to the
|
||||
character tables handling have been done:
|
||||
|
||||
(a) The dftables auxiliary program is renamed pcre2_dftables, but it is still
|
||||
not installed for public use.
|
||||
|
||||
(b) There is now a -b option for pcre2_dftables, which causes the tables to
|
||||
be written in binary. There is also a -help option.
|
||||
|
||||
(c) PCRE2_CONFIG_TABLES_LENGTH is added to pcre2_config() so that an
|
||||
application that wants to save tables in binary knows how long they are.
|
||||
|
||||
22. Changed setting of CMAKE_MODULE_PATH in CMakeLists.txt from SET to
|
||||
LIST(APPEND...) to allow a setting from the command line to be included.
|
||||
|
||||
23. Updated to Unicode 13.0.0.
|
||||
|
||||
24. CMake build now checks for secure_getenv() and strerror(). Patch by Carlo.
|
||||
|
||||
25. Avoid using [-1] as a suffix in pcre2test because it can provoke a compiler
|
||||
warning.
|
||||
|
||||
26. Added tests for __attribute__((uninitialized)) to both the configure and
|
||||
CMake build files, and then applied this attribute to the variable called
|
||||
stack_frames_vector[] in pcre2_match(). When implemented, this disables
|
||||
automatic initialization (a facility in clang), which can take time on big
|
||||
variables.
|
||||
|
||||
27. Updated CMakeLists.txt (patches by Uwe Korn) to add support for
|
||||
pcre2-config, the libpcre*.pc files, SOVERSION, VERSION and the
|
||||
MACHO_*_VERSIONS settings for CMake builds.
|
||||
|
||||
28. Another patch to CMakeLists.txt to check for mkostemp (configure already
|
||||
does). Patch by Carlo Marcelo Arenas Belon.
|
||||
|
||||
29. Check for the existence of memfd_create in both CMake and configure
|
||||
configurations. Patch by Carlo Marcelo Arenas Belon.
|
||||
|
||||
30. Restrict the configuration setting for the SELinux compatible execmem
|
||||
allocator (change 10.30/44) to Linux and NetBSD.
|
||||
|
||||
|
||||
Version 10.34 21-November-2019
|
||||
|
@ -337,7 +894,7 @@ Patch by Guillem Jover.
|
|||
warnings were reported.
|
||||
|
||||
38. Using the clang compiler with sanitizing options causes runtime complaints
|
||||
about truncation for statments such as x = ~x when x is an 8-bit value; it
|
||||
about truncation for statements such as x = ~x when x is an 8-bit value; it
|
||||
seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
|
||||
gets rid of the warnings. There were also two missing casts in pcre2test.
|
||||
|
||||
|
|
25
CheckMan
25
CheckMan
|
@ -16,6 +16,7 @@ while (scalar(@ARGV) > 0)
|
|||
|
||||
while (<IN>)
|
||||
{
|
||||
$count = 0;
|
||||
$line++;
|
||||
if (/^\s*$/)
|
||||
{
|
||||
|
@ -50,14 +51,24 @@ while (scalar(@ARGV) > 0)
|
|||
$yield = 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
elsif (/\\[^ef]|\\f[^IBP]/)
|
||||
{
|
||||
if (/\\[^ef]|\\f[^IBP]/)
|
||||
{
|
||||
printf "Bad backslash in line $line of $file\n";
|
||||
$yield = 1;
|
||||
}
|
||||
}
|
||||
printf "Bad backslash in line $line of $file\n";
|
||||
$yield = 1;
|
||||
}
|
||||
while (/\\f[BI]/g)
|
||||
{
|
||||
$count++;
|
||||
}
|
||||
while (/\\fP/g)
|
||||
{
|
||||
$count--;
|
||||
}
|
||||
if ($count != 0)
|
||||
{
|
||||
printf "Mismatching formatting in line $line of $file\n";
|
||||
$yield = 1;
|
||||
}
|
||||
}
|
||||
|
||||
close(IN);
|
||||
|
|
64
HACKING
64
HACKING
|
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
|
|||
the pcre2test documentation and the comment at the head of the RunTest file.
|
||||
|
||||
PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
|
||||
releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
|
||||
confusion with PCRE1.
|
||||
releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
|
||||
releases started at 10.00 to avoid confusion with PCRE1.
|
||||
|
||||
|
||||
Historical note 1
|
||||
|
@ -38,8 +38,8 @@ Historical note 2
|
|||
By contrast, the code originally written by Henry Spencer (which was
|
||||
subsequently heavily modified for Perl) compiles the expression twice: once in
|
||||
a dummy mode in order to find out how much store will be needed, and then for
|
||||
real. (The Perl version probably doesn't do this any more; I'm talking about
|
||||
the original library.) The execution function operates by backtracking and
|
||||
real. (The Perl version may or may not still do this; I'm talking about the
|
||||
original library.) The execution function operates by backtracking and
|
||||
maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
|
||||
matches individual wild portions of the pattern. This is an "NFA algorithm" in
|
||||
Friedl's terminology.
|
||||
|
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
|
|||
advance to check for such values. When auto-callouts are enabled, the generous
|
||||
assumption is made that there will be a callout for each pattern code unit
|
||||
(which of course is only actually true if all code units are literals) plus one
|
||||
at the end. There is a default parsed pattern vector on the system stack, but
|
||||
if this is not big enough, heap memory is used.
|
||||
at the end. A default parsed pattern vector is defined on the system stack, to
|
||||
minimize memory handling, but if this is not big enough, heap memory is used.
|
||||
|
||||
As before, the actual compiling function is run twice, the first time to
|
||||
determine the amount of memory needed for the final compiled pattern. It
|
||||
|
@ -187,7 +187,7 @@ META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
|
|||
META_CLASS_EMPTY_NOT [^] negative empty class - ditto
|
||||
META_CLASS_END ] end of non-empty class
|
||||
META_CLASS_NOT [^ start non-empty negative class
|
||||
META_COMMIT (*COMMIT)
|
||||
META_COMMIT (*COMMIT) - no argument (see below for with argument)
|
||||
META_COND_ASSERT (?(?assertion)
|
||||
META_DOLLAR $ metacharacter
|
||||
META_DOT . metacharacter
|
||||
|
@ -201,18 +201,18 @@ META_NOCAPTURE (?: no capture parens
|
|||
META_PLUS +
|
||||
META_PLUS_PLUS ++
|
||||
META_PLUS_QUERY +?
|
||||
META_PRUNE (*PRUNE) - no argument
|
||||
META_PRUNE (*PRUNE) - no argument (see below for with argument)
|
||||
META_QUERY ?
|
||||
META_QUERY_PLUS ?+
|
||||
META_QUERY_QUERY ??
|
||||
META_RANGE_ESCAPED hyphen in class range with at least one escape
|
||||
META_RANGE_LITERAL hyphen in class range defined literally
|
||||
META_SKIP (*SKIP) - no argument
|
||||
META_THEN (*THEN) - no argument
|
||||
META_SKIP (*SKIP) - no argument (see below for with argument)
|
||||
META_THEN (*THEN) - no argument (see below for with argument)
|
||||
|
||||
The two RANGE values occur only in character classes. They are positioned
|
||||
between two literals that define the start and end of the range. In an EBCDIC
|
||||
evironment it is necessary to know whether either of the range values was
|
||||
environment it is necessary to know whether either of the range values was
|
||||
specified as an escape. In an ASCII/Unicode environment the distinction is not
|
||||
relevant.
|
||||
|
||||
|
@ -229,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
|
|||
is the length of its branch, for which OP_REVERSE must be generated.
|
||||
|
||||
META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
|
||||
their data in the lower 16 bits of the element.
|
||||
their data in the lower 16 bits of the element. META_RECURSE is followed by an
|
||||
offset, for use in error messages.
|
||||
|
||||
META_BACKREF is followed by an offset if the back reference group number is 10
|
||||
or more. The offsets of the first ocurrences of references to groups whose
|
||||
or more. The offsets of the first occurrences of references to groups whose
|
||||
numbers are less than 10 are put in cb->small_ref_offset[] (only the first
|
||||
occurrence is useful). On 64-bit systems this avoids using more than two parsed
|
||||
pattern elements for items such as \3. The offset is used when an error occurs
|
||||
because the reference is to a non-existent group.
|
||||
|
||||
META_RECURSE is always followed by an offset, for use in error messages.
|
||||
|
||||
META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
|
||||
element contains the 16-bit type and data property values, packed together.
|
||||
ESC_g and ESC_k are used only for named references - numerical ones are turned
|
||||
|
@ -291,9 +290,9 @@ META_LOOKBEHIND (?<= start of lookbehind
|
|||
META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind
|
||||
META_LOOKBEHINDNOT (?<! start of negative lookbehind
|
||||
|
||||
The following are followed by two elements, the minimum and maximum. Repeat
|
||||
values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
|
||||
represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
The following are followed by two elements, the minimum and maximum. The
|
||||
maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
|
||||
is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
|
||||
META_MINMAX {n,m} repeat
|
||||
META_MINMAX_PLUS {n,m}+ repeat
|
||||
|
@ -347,11 +346,11 @@ support is not available for this kind of matching.
|
|||
Changeable options
|
||||
------------------
|
||||
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
|
||||
others) may be changed in the middle of patterns by items such as (?i). Their
|
||||
processing is handled entirely at compile time by generating different opcodes
|
||||
for the different settings. The runtime functions do not need to keep track of
|
||||
an option's state.
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
|
||||
some others may be changed in the middle of patterns by items such as (?i).
|
||||
Their processing is handled entirely at compile time by generating different
|
||||
opcodes for the different settings. The runtime functions do not need to keep
|
||||
track of an option's state.
|
||||
|
||||
PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
|
||||
are tracked and processed during the parsing pre-pass. The others are handled
|
||||
|
@ -437,7 +436,7 @@ Backtracking control verbs
|
|||
--------------------------
|
||||
|
||||
Verbs with no arguments generate opcodes with no following data (as listed
|
||||
in the section above).
|
||||
in the section above).
|
||||
|
||||
(*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
|
||||
length in one code unit, and followed by a binary zero. The name length is
|
||||
|
@ -468,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
|
|||
case-equivalent code points (which is possible only in UTF mode) is handled by
|
||||
compiling a Unicode property item (see below), with the pseudo-property
|
||||
PT_CLIST. The value of this property is an offset in a vector called
|
||||
"ucd_caseless_sets" which identifies the start of a short list of equivalent
|
||||
characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
"ucd_caseless_sets" which identifies the start of a short list of case
|
||||
equivalent characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
|
||||
|
||||
Repeating single characters
|
||||
|
@ -546,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
|
|||
and a value. The types are a set of #defines of the form PT_xxx, and the values
|
||||
are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
|
||||
The value is relevant only for PT_GC (General Category), PT_PC (Particular
|
||||
Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
|
||||
identify a list of case-equivalent characters when there are three or more.
|
||||
Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
|
||||
and the pseudo-property PT_CLIST, which is used to identify a list of
|
||||
case-equivalent characters when there are three or more (see above).
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
|
||||
|
@ -665,9 +665,9 @@ a count that immediately follows the offset.
|
|||
There are several opcodes that mark the end of a subpattern group. OP_KET is
|
||||
used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
|
||||
OP_KETRMAX are used for indefinite repetitions, minimally or maximally
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
details). All four are followed by a LINK_SIZE value giving (as a positive
|
||||
number) the offset back to the matching bracket opcode.
|
||||
number) the offset back to the matching opening bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
|
@ -718,7 +718,7 @@ Assertions
|
|||
|
||||
Forward assertions are also just like other subpatterns, but starting with one
|
||||
of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
|
||||
assertion is OP_REVERSE, followed by a count of the number of characters to
|
||||
move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
|
||||
|
@ -827,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
|
|||
opcode are the correct length, in order to catch updating errors.
|
||||
|
||||
Philip Hazel
|
||||
12 July 2019
|
||||
April 2022
|
||||
|
|
12
LICENCE
12
LICENCE
|
@ -20,13 +20,13 @@ THE BASIC LIBRARY FUNCTIONS
|
|||
---------------------------
|
||||
|
||||
Written by: Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2019 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2019 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2019 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
module(
|
||||
name = "pcre2",
|
||||
version = "10.40",
|
||||
compatibility_level = 1,
|
||||
)
|
||||
|
||||
bazel_dep(name = "rules_cc", version = "0.0.1")
|
||||
bazel_dep(name = "bazel_skylib", version = "1.2.1")
|
43
Makefile.am
43
Makefile.am
|
@ -325,18 +325,18 @@ include_HEADERS = src/pcre2posix.h
|
|||
bin_SCRIPTS = pcre2-config
|
||||
|
||||
## ---------------------------------------------------------------
|
||||
## The dftables program is used to rebuild character tables before compiling
|
||||
## PCRE2, if --enable-rebuild-chartables is specified. It is not a user-visible
|
||||
## program. The default (when --enable-rebuild-chartables is not specified) is
|
||||
## to copy a distributed set of tables that are defined for ASCII code. In this
|
||||
## case, dftables is not needed.
|
||||
## The pcre2_dftables program is used to rebuild character tables before
|
||||
## compiling PCRE2, if --enable-rebuild-chartables is specified. It is not an
|
||||
## installed program. The default (when --enable-rebuild-chartables is not
|
||||
## specified) is to copy a distributed set of tables that are defined for ASCII
|
||||
## code. In this case, pcre2_dftables is not needed.
|
||||
|
||||
if WITH_REBUILD_CHARTABLES
|
||||
noinst_PROGRAMS += dftables
|
||||
dftables_SOURCES = src/dftables.c
|
||||
src/pcre2_chartables.c: dftables$(EXEEXT)
|
||||
noinst_PROGRAMS += pcre2_dftables
|
||||
pcre2_dftables_SOURCES = src/pcre2_dftables.c
|
||||
src/pcre2_chartables.c: pcre2_dftables$(EXEEXT)
|
||||
rm -f $@
|
||||
./dftables$(EXEEXT) $@
|
||||
./pcre2_dftables$(EXEEXT) $@
|
||||
else
|
||||
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.dist
|
||||
rm -f $@
|
||||
|
@ -382,6 +382,10 @@ COMMON_SOURCES = \
|
|||
src/pcre2_valid_utf.c \
|
||||
src/pcre2_xclass.c
|
||||
|
||||
# The pcre2_ucptables.c file is #included by pcre2_tables.c
|
||||
|
||||
EXTRA_DIST += src/pcre2_ucptables.c
|
||||
|
||||
if WITH_PCRE2_8
|
||||
lib_LTLIBRARIES += libpcre2-8.la
|
||||
libpcre2_8_la_SOURCES = \
|
||||
|
@ -391,6 +395,7 @@ nodist_libpcre2_8_la_SOURCES = \
|
|||
libpcre2_8_la_CFLAGS = \
|
||||
-DPCRE2_CODE_UNIT_WIDTH=8 \
|
||||
$(VISIBILITY_CFLAGS) \
|
||||
$(CET_CFLAGS) \
|
||||
$(AM_CFLAGS)
|
||||
libpcre2_8_la_LIBADD =
|
||||
endif # WITH_PCRE2_8
|
||||
|
@ -404,6 +409,7 @@ nodist_libpcre2_16_la_SOURCES = \
|
|||
libpcre2_16_la_CFLAGS = \
|
||||
-DPCRE2_CODE_UNIT_WIDTH=16 \
|
||||
$(VISIBILITY_CFLAGS) \
|
||||
$(CET_CFLAGS) \
|
||||
$(AM_CFLAGS)
|
||||
libpcre2_16_la_LIBADD =
|
||||
endif # WITH_PCRE2_16
|
||||
|
@ -417,6 +423,7 @@ nodist_libpcre2_32_la_SOURCES = \
|
|||
libpcre2_32_la_CFLAGS = \
|
||||
-DPCRE2_CODE_UNIT_WIDTH=32 \
|
||||
$(VISIBILITY_CFLAGS) \
|
||||
$(CET_CFLAGS) \
|
||||
$(AM_CFLAGS)
|
||||
libpcre2_32_la_LIBADD =
|
||||
endif # WITH_PCRE2_32
|
||||
|
@ -445,15 +452,16 @@ EXTRA_DIST += \
|
|||
src/sljit/sljitNativePPC_32.c \
|
||||
src/sljit/sljitNativePPC_64.c \
|
||||
src/sljit/sljitNativePPC_common.c \
|
||||
src/sljit/sljitNativeSPARC_32.c \
|
||||
src/sljit/sljitNativeSPARC_common.c \
|
||||
src/sljit/sljitNativeTILEGX-encoder.c \
|
||||
src/sljit/sljitNativeTILEGX_64.c \
|
||||
src/sljit/sljitNativeRISCV_32.c \
|
||||
src/sljit/sljitNativeRISCV_64.c \
|
||||
src/sljit/sljitNativeRISCV_common.c \
|
||||
src/sljit/sljitNativeS390X.c \
|
||||
src/sljit/sljitNativeX86_32.c \
|
||||
src/sljit/sljitNativeX86_64.c \
|
||||
src/sljit/sljitNativeX86_common.c \
|
||||
src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitUtils.c
|
||||
src/sljit/sljitUtils.c \
|
||||
src/sljit/sljitWXExecAllocator.c
|
||||
|
||||
# Some of the JIT sources are also in separate files that are #included.
|
||||
|
||||
|
@ -634,6 +642,7 @@ EXTRA_DIST += \
|
|||
testdata/grepoutputCN \
|
||||
testdata/grepoutputN \
|
||||
testdata/greppatN4 \
|
||||
testdata/testbtables \
|
||||
testdata/testinput1 \
|
||||
testdata/testinput2 \
|
||||
testdata/testinput3 \
|
||||
|
@ -659,6 +668,7 @@ EXTRA_DIST += \
|
|||
testdata/testinput23 \
|
||||
testdata/testinput24 \
|
||||
testdata/testinput25 \
|
||||
testdata/testinput26 \
|
||||
testdata/testinputEBC \
|
||||
testdata/testoutput1 \
|
||||
testdata/testoutput2 \
|
||||
|
@ -701,6 +711,7 @@ EXTRA_DIST += \
|
|||
testdata/testoutput23 \
|
||||
testdata/testoutput24 \
|
||||
testdata/testoutput25 \
|
||||
testdata/testoutput26 \
|
||||
testdata/testoutputEBC \
|
||||
testdata/valgrind-jit.supp \
|
||||
testdata/wintestinput3 \
|
||||
|
@ -855,9 +866,11 @@ endif # WITH_GCOV
|
|||
|
||||
EXTRA_DIST += \
|
||||
cmake/COPYING-CMAKE-SCRIPTS \
|
||||
cmake/FindEditline.cmake \
|
||||
cmake/FindPackageHandleStandardArgs.cmake \
|
||||
cmake/FindReadline.cmake \
|
||||
cmake/FindEditline.cmake \
|
||||
cmake/pcre2-config-version.cmake.in \
|
||||
cmake/pcre2-config.cmake.in \
|
||||
CMakeLists.txt \
|
||||
config-cmake.h.in
|
||||
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
#
|
||||
# Project: pcre2
|
||||
#
|
||||
# Created on: 10-01-2022 22:01:46
|
||||
#
|
||||
# commands to use:
|
||||
# make -f Makefile.os4 libpcre2.a
|
||||
# make -f Makefile.os4 libpcre2-posix.a
|
||||
# make -f Makefile.os4 pcre2test
|
||||
# sh RunTest
|
||||
# make -f Makefile.os4 clean
|
||||
#
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Objects
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2_OBJ := \
|
||||
src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
|
||||
src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
|
||||
src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
|
||||
src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
|
||||
src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
|
||||
src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
|
||||
src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
|
||||
src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
|
||||
src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
|
||||
|
||||
|
||||
|
||||
pcre2posix_OBJ := \
|
||||
src/pcre2posix.o
|
||||
|
||||
|
||||
pcre2test_OBJ := \
|
||||
src/pcre2test.o
|
||||
|
||||
|
||||
pcre2grep_OBJ := \
|
||||
src/pcre2grep.o
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Variables and Environment
|
||||
##
|
||||
###################################################################
|
||||
|
||||
MCRT := -mcrt=newlib
|
||||
ifeq ($(USE_CLIB2), yes)
|
||||
MCRT := -mcrt=clib2
|
||||
endif
|
||||
|
||||
CC := gcc:bin/gcc
|
||||
|
||||
INCPATH := -I. -Isrc
|
||||
|
||||
# for pcre2test
|
||||
CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// General rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# Targets that do not produce a file of the same name. "release" in particular
# creates (and later deletes) a directory called release, so it must be phony.
.PHONY: all all-before all-after clean clean-custom cleanall tests runtests release
|
||||
|
||||
all: all-before libpcre2.a libpcre2-posix.a all-after
|
||||
|
||||
all-before:
|
||||
# You can add rules here to execute before the project is built
|
||||
|
||||
all-after:
|
||||
# You can add rules here to execute after the project is built
|
||||
|
||||
tests: pcre2test pcre2grep
|
||||
|
||||
# Remove every object file this makefile can produce: the library objects, the
# POSIX wrapper object, and the pcre2test and pcre2grep program objects.
clean: clean-custom
|
||||
@echo "Cleaning compiler objects..."
|
||||
@rm -f $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ) $(pcre2grep_OBJ)
|
||||
|
||||
# Remove the objects (via clean) plus the final targets. The archive names must
# match the libpcre2.a and libpcre2-posix.a targets defined below.
cleanall: clean
|
||||
@echo "Cleaning compiler targets..."
|
||||
@rm -f libpcre2.a libpcre2-posix.a pcre2test pcre2grep
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Targets
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2.a: $(libpcre2_OBJ)
|
||||
ar -rcs libpcre2.a $(libpcre2_OBJ)
|
||||
ranlib libpcre2.a
|
||||
|
||||
libpcre2-posix.a: $(pcre2posix_OBJ)
|
||||
ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
|
||||
ranlib libpcre2-posix.a
|
||||
|
||||
pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
|
||||
@echo "Linking pcre2test"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2 -lpcre2-posix
|
||||
@echo "Removing stale debug target: pcre2test"
|
||||
@rm -f pcre2test.debug
|
||||
|
||||
pcre2grep: libpcre2.a $(pcre2grep_OBJ)
|
||||
@echo "Linking pcre2grep"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
|
||||
@echo "Removing stale debug target: pcre2grep"
|
||||
@rm -f pcre2grep.debug
|
||||
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Standard rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# A default rule to make all the objects listed below
|
||||
# because we are hiding compiler commands from the output
|
||||
|
||||
.c.o:
|
||||
@echo "Compiling $<"
|
||||
@$(CC) -c $< -o $*.o $(CFLAGS)
|
||||
|
||||
src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
|
||||
src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
|
||||
src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
|
||||
|
||||
src/pcre2_maketables.o: src/pcre2_maketables.c
|
||||
|
||||
src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
|
||||
src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
|
||||
src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
|
||||
src/pcre2_ucd.c src/pcre2_printint.c
|
||||
|
||||
src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
|
||||
|
||||
|
||||
src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
|
||||
src/pcre2grep.o: src/pcre2grep.c src/config.h
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Custom rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
runtests: libpcre2.a libpcre2-posix.a tests
|
||||
sh RunTest
|
||||
sh RunGrepTest
|
||||
|
||||
# Build the static libraries twice (newlib, then clib2 via USE_CLIB2=yes), stage
# them with the public headers and documentation under release/, pack the tree
# into pcre2.lha, and remove the staging directory. Sub-builds go through
# $(MAKE) so they use the same make program and flags as the invoking make.
release:
|
||||
@echo "Create release folders..."
|
||||
@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
|
||||
|
||||
@echo "Building newlib based libraries..."
|
||||
@$(MAKE) -f Makefile.os4 all
|
||||
@cp libpcre2.a release/local/newlib/lib/
|
||||
@cp libpcre2-posix.a release/local/newlib/lib/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@$(MAKE) -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Building clib2 based libraries..."
|
||||
@$(MAKE) -f Makefile.os4 all USE_CLIB2=yes
|
||||
@cp libpcre2.a release/local/clib2/lib/
|
||||
@cp libpcre2-posix.a release/local/clib2/lib/
|
||||
|
||||
@echo "Copy the necessary files..."
|
||||
@cp src/pcre2.h release/local/common/include/
|
||||
@cp src/pcre2posix.h release/local/common/include/
|
||||
@cp COPYING release/local/Documentation/pcre2/
|
||||
@cp HACKING release/local/Documentation/pcre2/
|
||||
@cp LICENCE release/local/Documentation/pcre2/
|
||||
@cp README release/local/Documentation/pcre2/
|
||||
@cp README-OS4.md release/local/Documentation/pcre2/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@$(MAKE) -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Creating the lha release file..."
|
||||
@rm -f pcre2.lha
|
||||
@lha -aeqr3 a pcre2.lha release/
|
||||
|
||||
@rm -rf release
|
||||
|
||||
###################################################################
|
||||
|
100
NEWS
100
NEWS
|
@ -2,6 +2,106 @@ News about PCRE2 releases
|
|||
-------------------------
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
This is mostly a bug-fixing and code-tidying release. However, there are some
|
||||
extensions to Unicode property handling:
|
||||
|
||||
* Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
* A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
As always, see ChangeLog for a list of all changes (also the Git log).
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
This release is happening soon after 10.38 because the bug fix is important.
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Update to Unicode 14.0.0.
|
||||
|
||||
3. Some code cleanups (see ChangeLog).
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
As well as some bug fixes and tidies (as always, see ChangeLog for details),
|
||||
the documentation is updated to list the new URLs, following the move of the
|
||||
source repository to GitHub and the mailing list to Google Groups.
|
||||
|
||||
* The CMake build system can now build both static and shared libraries in one
|
||||
go.
|
||||
|
||||
* Following Perl's lead, \K is now locked out in lookaround assertions by
|
||||
default, but an option is provided to re-enable the previous behaviour.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
-------------------------
|
||||
|
||||
A few more bug fixes and tidies. The only change of real note is the removal of
|
||||
the actual POSIX names regcomp etc. from the POSIX wrapper library because
|
||||
these have caused issues for some applications (see 10.33 #2 below).
|
||||
|
||||
|
||||
Version 10.36 04-December-2020
|
||||
------------------------------
|
||||
|
||||
Again, mainly bug fixes and tidies. The only enhancements are the addition of
|
||||
GNU grep's -m (aka --max-count) option to pcre2grep, and also unifying the
|
||||
handling of substitution strings for both -O and callouts in pcre2grep, with
|
||||
the addition of $x{...} and $o{...} to allow for characters whose code points
|
||||
are greater than 255 in Unicode mode.
|
||||
|
||||
NOTE: there is an outstanding issue with JIT support for MacOS on arm64
|
||||
hardware. For details, please see Bugzilla issue #2618.
|
||||
|
||||
|
||||
Version 10.35 15-April-2020
|
||||
---------------------------
|
||||
|
||||
Bugfixes, tidies, and a few new enhancements.
|
||||
|
||||
1. Capturing groups that contain recursive backreferences to themselves are no
|
||||
longer automatically atomic, because the restriction is no longer necessary
|
||||
as a result of the 10.30 restructuring.
|
||||
|
||||
2. Several new options for pcre2_substitute().
|
||||
|
||||
3. When Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode
|
||||
character properties are used for upper/lower case computations on characters
|
||||
whose code points are greater than 127.
|
||||
|
||||
4. The character tables (for low-valued characters) can now more easily be
|
||||
saved and restored in binary.
|
||||
|
||||
5. Updated to Unicode 13.0.0.
|
||||
|
||||
|
||||
Version 10.34 21-November-2019
|
||||
------------------------------
|
||||
|
||||
|
|
|
@ -40,7 +40,11 @@ GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY
|
|||
|
||||
The following are generic instructions for building the PCRE2 C library "by
|
||||
hand". If you are going to use CMake, this section does not apply to you; you
|
||||
can skip ahead to the CMake section.
|
||||
can skip ahead to the CMake section. Note that the settings concerned with
|
||||
8-bit, 16-bit, and 32-bit code units relate to the type of data string that
|
||||
PCRE2 processes. They are NOT referring to the underlying operating system bit
|
||||
width. You do not have to do anything special to compile in a 64-bit
|
||||
environment, for example.
|
||||
|
||||
(1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
|
||||
macro settings that it contains to whatever is appropriate for your
|
||||
|
@ -74,23 +78,23 @@ can skip ahead to the CMake section.
|
|||
src/pcre2_chartables.c.
|
||||
|
||||
OR:
|
||||
Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
|
||||
if you have set up src/config.h), and then run it with the single
|
||||
argument "src/pcre2_chartables.c". This generates a set of standard
|
||||
character tables and writes them to that file. The tables are generated
|
||||
using the default C locale for your system. If you want to use a locale
|
||||
that is specified by LC_xxx environment variables, add the -L option to
|
||||
the dftables command. You must use this method if you are building on a
|
||||
system that uses EBCDIC code.
|
||||
Compile src/pcre2_dftables.c as a stand-alone program (using
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h), and then run it with
|
||||
the single argument "src/pcre2_chartables.c". This generates a set of
|
||||
standard character tables and writes them to that file. The tables are
|
||||
generated using the default C locale for your system. If you want to use
|
||||
a locale that is specified by LC_xxx environment variables, add the -L
|
||||
option to the pcre2_dftables command. You must use this method if you
|
||||
are building on a system that uses EBCDIC code.
|
||||
|
||||
The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
|
||||
specify alternative tables at run time.
|
||||
|
||||
(4) For an 8-bit library, compile the following source files from the src
|
||||
directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also
|
||||
set -DHAVE_CONFIG_H if you have set up src/config.h with your
|
||||
configuration, or else use other -D settings to change the configuration
|
||||
as required.
|
||||
(4) For a library that supports 8-bit code units in the character strings that
|
||||
it processes, compile the following source files from the src directory,
|
||||
setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h with your configuration,
|
||||
or else use other -D settings to change the configuration as required.
|
||||
|
||||
pcre2_auto_possess.c
|
||||
pcre2_chartables.c
|
||||
|
@ -117,6 +121,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -142,9 +147,9 @@ can skip ahead to the CMake section.
|
|||
If your system has static and shared libraries, you may have to do this
|
||||
once for each type.
|
||||
|
||||
(6) If you want to build a 16-bit library or 32-bit library (as well as, or
|
||||
instead of the 8-bit library) just supply 16 or 32 as the value of
|
||||
-DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
(6) If you want to build a library that supports 16-bit or 32-bit code units
|
||||
(as well as, or instead of the 8-bit library) just supply 16 or 32 as the
|
||||
value of -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
|
||||
(7) If you want to build the POSIX wrapper functions (which apply only to the
|
||||
8-bit library), ensure that you have the src/pcre2posix.h file and then
|
||||
|
@ -302,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start CMake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -339,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -369,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
@ -401,6 +406,6 @@ Everything in that location, source and executable, is in EBCDIC and native
|
|||
z/OS file formats. The port provides an API for LE languages such as COBOL and
|
||||
for the z/OS and z/VM versions of the Rexx languages.
|
||||
|
||||
==============================
|
||||
Last Updated: 14 November 2018
|
||||
==============================
|
||||
===========================
|
||||
Last Updated: 28 April 2021
|
||||
===========================
|
||||
|
|
|
@ -190,7 +190,7 @@ files="\
|
|||
libpcre2-16.pc.in \
|
||||
libpcre2-32.pc.in \
|
||||
libpcre2-posix.pc.in \
|
||||
src/dftables.c \
|
||||
src/pcre2_dftables.c \
|
||||
src/pcre2.h.in \
|
||||
src/pcre2_auto_possess.c \
|
||||
src/pcre2_compile.c \
|
||||
|
|
154
README
154
README
|
@ -4,18 +4,20 @@ README file for PCRE2 (Perl-compatible regular expression library)
|
|||
PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
||||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features and the internals have been improved. The latest release of PCRE2 is
|
||||
available in three alternative formats from:
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE (both the
|
||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||
subscribe or manage your subscription here:
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
pcre2-dev+subscribe@googlegroups.com.
|
||||
|
||||
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -112,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -186,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -269,9 +277,9 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
a program called dftables is compiled and run in the default C locale when
|
||||
you obey "make". It builds a source file called pcre2_chartables.c. If you do
|
||||
not specify this option, pcre2_chartables.c is created as a copy of
|
||||
a program called pcre2_dftables is compiled and run in the default C locale
|
||||
when you obey "make". It builds a source file called pcre2_chartables.c. If
|
||||
you do not specify this option, pcre2_chartables.c is created as a copy of
|
||||
pcre2_chartables.c.dist. See "Character tables" below for further
|
||||
information.
|
||||
|
||||
|
@ -297,8 +305,8 @@ library. They are also documented in the pcre2build man page.
|
|||
unaddressable. This allows it to detect invalid memory accesses, and is
|
||||
mostly useful for debugging PCRE2 itself.
|
||||
|
||||
. In environments where the gcc compiler is used and lcov version 1.6 or above
|
||||
is installed, if you specify
|
||||
. In environments where the gcc compiler is used and lcov is installed, if you
|
||||
specify
|
||||
|
||||
--enable-coverage
|
||||
|
||||
|
@ -367,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -392,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -409,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -548,11 +557,11 @@ Cross-compiling using autotools
|
|||
|
||||
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||
order to cross-compile PCRE2 for some other host. However, you should NOT
|
||||
specify --enable-rebuild-chartables, because if you do, the dftables.c source
|
||||
file is compiled and run on the local host, in order to generate the inbuilt
|
||||
character tables (the pcre2_chartables.c file). This will probably not work,
|
||||
because dftables.c needs to be compiled with the local compiler, not the cross
|
||||
compiler.
|
||||
specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c
|
||||
source file is compiled and run on the local host, in order to generate the
|
||||
inbuilt character tables (the pcre2_chartables.c file). This will probably not
|
||||
work, because pcre2_dftables.c needs to be compiled with the local compiler,
|
||||
not the cross compiler.
|
||||
|
||||
When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
|
||||
created by making a copy of pcre2_chartables.c.dist, which is a default set of
|
||||
|
@ -560,9 +569,10 @@ tables that assumes ASCII code. Cross-compiling with the default tables should
|
|||
not be a problem.
|
||||
|
||||
If you need to modify the character tables when cross-compiling, you should
|
||||
move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
|
||||
and run it on the local host to make a new version of pcre2_chartables.c.dist.
|
||||
Then when you cross-compile PCRE2 this new version of the tables will be used.
|
||||
move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by
|
||||
hand and run it on the local host to make a new version of
|
||||
pcre2_chartables.c.dist. See the pcre2build section "Creating character tables
|
||||
at build time" for more details.
|
||||
|
||||
|
||||
Making new tarballs
|
||||
|
@ -599,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -686,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -721,8 +731,8 @@ compile context.
|
|||
The source file called pcre2_chartables.c contains the default set of tables.
|
||||
By default, this is created as a copy of pcre2_chartables.c.dist, which
|
||||
contains tables for ASCII coding. However, if --enable-rebuild-chartables is
|
||||
specified for ./configure, a different version of pcre2_chartables.c is built
|
||||
by the program dftables (compiled from dftables.c), which uses the ANSI C
|
||||
specified for ./configure, a new version of pcre2_chartables.c is built by the
|
||||
program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C
|
||||
character handling functions such as isalnum(), isalpha(), isupper(),
|
||||
islower(), etc. to build the table sources. This means that the default C
|
||||
locale that is set for your system will control the contents of these default
|
||||
|
@ -732,32 +742,40 @@ file does not get automatically re-generated. The best way to do this is to
|
|||
move pcre2_chartables.c.dist out of the way and replace it with your customized
|
||||
tables.
|
||||
|
||||
When the dftables program is run as a result of --enable-rebuild-chartables,
|
||||
it uses the default C locale that is set on your system. It does not pay
|
||||
attention to the LC_xxx environment variables. In other words, it uses the
|
||||
system's default locale rather than whatever the compiling user happens to have
|
||||
set. If you really do want to build a source set of character tables in a
|
||||
locale that is specified by the LC_xxx variables, you can run the dftables
|
||||
program by hand with the -L option. For example:
|
||||
When the pcre2_dftables program is run as a result of specifying
|
||||
--enable-rebuild-chartables, it uses the default C locale that is set on your
|
||||
system. It does not pay attention to the LC_xxx environment variables. In other
|
||||
words, it uses the system's default locale rather than whatever the compiling
|
||||
user happens to have set. If you really do want to build a source set of
|
||||
character tables in a locale that is specified by the LC_xxx variables, you can
|
||||
run the pcre2_dftables program by hand with the -L option. For example:
|
||||
|
||||
./dftables -L pcre2_chartables.c.special
|
||||
./pcre2_dftables -L pcre2_chartables.c.special
|
||||
|
||||
The first two 256-byte tables provide lower casing and case flipping functions,
|
||||
respectively. The next table consists of three 32-byte bit maps which identify
|
||||
digits, "word" characters, and white space, respectively. These are used when
|
||||
building 32-byte bit maps that represent character classes for code points less
|
||||
than 256. The final 256-byte table has bits indicating various character types,
|
||||
as follows:
|
||||
The second argument names the file where the source code for the tables is
|
||||
written. The first two 256-byte tables provide lower casing and case flipping
|
||||
functions, respectively. The next table consists of a number of 32-byte bit
|
||||
maps which identify certain character classes such as digits, "word"
|
||||
characters, white space, etc. These are used when building 32-byte bit maps
|
||||
that represent character classes for code points less than 256. The final
|
||||
256-byte table has bits indicating various character types, as follows:
|
||||
|
||||
1 white space character
|
||||
2 letter
|
||||
4 decimal digit
|
||||
8 hexadecimal digit
|
||||
4 lower case letter
|
||||
8 decimal digit
|
||||
16 alphanumeric or '_'
|
||||
128 regular expression metacharacter or binary zero
|
||||
|
||||
You should not alter the set of characters that contain the 128 bit, as that
|
||||
will cause PCRE2 to malfunction.
|
||||
You can also specify -b (with or without -L) when running pcre2_dftables. This
|
||||
causes the tables to be written in binary instead of as source code. A set of
|
||||
binary tables can be loaded into memory by an application and passed to
|
||||
pcre2_compile() in the same way as tables created dynamically by calling
|
||||
pcre2_maketables(). The tables are just a string of bytes, independent of
|
||||
hardware characteristics such as endianness. This means they can be bundled
|
||||
with an application that runs in different environments, to ensure consistent
|
||||
behaviour.
|
||||
|
||||
See also the pcre2build section "Creating character tables at build time".
|
||||
|
||||
|
||||
File manifest
|
||||
|
@ -768,7 +786,7 @@ The distribution should contain the files listed below.
|
|||
(A) Source files for the PCRE2 library functions and their headers are found in
|
||||
the src directory:
|
||||
|
||||
src/dftables.c auxiliary program for building pcre2_chartables.c
|
||||
src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c
|
||||
when --enable-rebuild-chartables is specified
|
||||
|
||||
src/pcre2_chartables.c.dist a default set of character tables that assume
|
||||
|
@ -892,6 +910,6 @@ The distribution should contain the files listed below.
|
|||
) environments
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 April 2019
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
PCRE2 (Perl-compatible regular expression library)
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
|
||||
GitHub repository https://github.com/PCRE2Project/pcre2
|
||||
|
||||
More information about PCRE can be found at its official website
|
||||
at https://www.pcre.org and in the documentation that comes with this
|
||||
package.
|
||||
|
||||
In the archive both newlib and clib2 libraries are included. It has been
|
||||
tested with various applications, but in case you find issues please
|
||||
contact me.
|
||||
|
||||
To install it into your AmigaOS 4 SDK installation, just extract all the
|
||||
files in the SDK: path.
|
||||
|
||||
Compile
|
||||
--------------------------
|
||||
The source and the changes I made can be found at my personal repository
|
||||
https://git.walkero.gr/walkero/pcre2
|
||||
|
||||
You can compile it using the Makefile.os4 file, and produce the libraries
|
||||
yourself.
|
||||
|
||||
* with newlib run:
|
||||
```bash
|
||||
make -f Makefile.os4 all
|
||||
```
|
||||
* with clib2 run:
|
||||
```bash
|
||||
make -f Makefile.os4 all USE_CLIB2=yes
|
||||
```
|
||||
|
||||
Changelog
|
||||
--------------------------
|
||||
v10.40r1 - 2022-07-31
|
||||
* First release
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
# PCRE2 - Perl-Compatible Regular Expressions
|
||||
|
||||
The PCRE2 library is a set of C functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
|
||||
own native API, as well as a set of wrapper functions that correspond to the
|
||||
POSIX regular expression API. The PCRE2 library is free, even for building
|
||||
proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
|
||||
or 32-bit code units, in either literal or UTF encoding.
|
||||
|
||||
PCRE2 was first released in 2015 to replace the API in the original PCRE
|
||||
library, which is now obsolete and no longer maintained. As well as a more
|
||||
flexible API, the code of PCRE2 has been much improved since the fork.
|
||||
|
||||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
If you just need the command-line PCRE2 tools on Windows, precompiled binary
|
||||
versions are available at this
|
||||
[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
|
||||
|
||||
A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
|
||||
default character encoding, can be found at
|
||||
[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
|
||||
|
||||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community-authored Wikipedia entry for PCRE.
|
||||
|
||||
There is a curated summary of changes for each PCRE release, copies of
|
||||
documentation from older releases, and other useful information from the third
|
||||
party authored
|
||||
[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
|
||||
|
||||
## Contact
|
||||
|
||||
To report a problem with the PCRE2 library, or to make a feature request, please
|
||||
use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
|
||||
PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
|
||||
announcements will be made. You can browse the
|
||||
[list archives](https://groups.google.com/g/pcre2-dev).
|
||||
|
87
RunGrepTest
87
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -558,7 +574,7 @@ echo "RC=$?" >>testtrygrep
|
|||
echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
|
||||
echo "a" >testtemp1grep
|
||||
echo "aaaaa" >>testtemp1grep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets --allow-lookaround-bsk '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 108 ------------------------------" >>testtrygrep
|
||||
|
@ -638,13 +654,13 @@ echo "RC=$?" >>testtrygrep
|
|||
|
||||
echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
|
||||
printf 'abcd\n' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?<=\K.)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K.)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?=.\K)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=.\K)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?<=\K[ac])' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K[ac])' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?=[ac]\K)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
|
||||
|
@ -661,6 +677,40 @@ echo "---------------------------- Test 128 -----------------------------" >>tes
|
|||
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -681,7 +731,7 @@ if [ $utf8 -ne 0 ] ; then
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U3 ------------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any --allow-lookaround-bsk '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
|
||||
|
@ -694,6 +744,10 @@ if [ $utf8 -ne 0 ] ; then
|
|||
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U6 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
||||
if [ $? != 0 ] ; then exit 1; fi
|
||||
|
||||
|
@ -731,24 +785,10 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# It seems impossible to handle NUL characters easily in many operating
|
||||
# systems, including Solaris (aka SunOS), where the version of sed explicitly
|
||||
# doesn't like them, and also MacOS (Darwin), OpenBSD, FreeBSD, and NetBSD. So
|
||||
# now we run this test only on OS that are known to work. For the rest, we
|
||||
# fudge the output so that the comparison works.
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
uname=`uname`
|
||||
case $uname in
|
||||
Linux)
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | sed 's/\x00/ZERO/' >>testtrygrep
|
||||
echo "" >>testtrygrep
|
||||
;;
|
||||
*)
|
||||
echo '1:abcZERO2:def' >>testtrygrep
|
||||
;;
|
||||
esac
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
echo "" >>testtrygrep
|
||||
|
||||
$cf $srcdir/testdata/grepoutputN testtrygrep
|
||||
if [ $? != 0 ] ; then exit 1; fi
|
||||
|
@ -764,6 +804,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
|
|||
$valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
|
||||
if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
|
||||
$cf $srcdir/testdata/grepoutputCN testtrygrep
|
||||
|
|
74
RunTest
74
RunTest
|
@ -17,8 +17,16 @@
|
|||
# individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
|
||||
# end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
|
||||
# runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
|
||||
# except test 10. Whatever order the arguments are in, the tests are always run
|
||||
# in numerical order.
|
||||
# except test 10. Whatever order the arguments are in, these tests are always
|
||||
# run in numerical order.
|
||||
#
|
||||
# If no specific tests are selected (which is the case when this script is run
|
||||
# via 'make check') the default is to run all the numbered tests.
|
||||
#
|
||||
# There may also be named (as well as numbered) tests for special purposes. At
|
||||
# present there is just one, called "heap". This test's output contains the
|
||||
# sizes of heap frames and frame vectors, which depend on the environment. It
|
||||
# is therefore not run unless explicitly requested.
|
||||
#
|
||||
# Inappropriate tests are automatically skipped (with a comment to say so). For
|
||||
# example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
|
||||
|
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
|||
title23="Test 23: \C disabled test"
|
||||
title24="Test 24: Non-UTF pattern conversion tests"
|
||||
title25="Test 25: UTF pattern conversion tests"
|
||||
maxtest=25
|
||||
title26="Test 26: Auto-generated unicode property tests"
|
||||
maxtest=26
|
||||
titleheap="Test 'heap': Environment-specific heap tests"
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title23
|
||||
echo $title24
|
||||
echo $title25
|
||||
echo $title26
|
||||
echo ""
|
||||
echo $titleheap
|
||||
echo ""
|
||||
echo "Numbered tests are automatically run if nothing selected."
|
||||
echo "Named tests must be explicitly selected."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -238,6 +254,8 @@ do22=no
|
|||
do23=no
|
||||
do24=no
|
||||
do25=no
|
||||
do26=no
|
||||
doheap=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
|
|||
23) do23=yes;;
|
||||
24) do24=yes;;
|
||||
25) do25=yes;;
|
||||
26) do26=yes;;
|
||||
heap) doheap=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -320,7 +340,8 @@ fi
|
|||
# set up a large stack.
|
||||
|
||||
$sim ./pcre2test -S 64 /dev/null /dev/null
|
||||
if [ $? -eq 0 -a "$bigstack" != "" ] ; then
|
||||
support_setstack=$?
|
||||
if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
|
||||
setstack="-S 64"
|
||||
else
|
||||
setstack=""
|
||||
|
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# If no specific tests were requested, select all. Those that are not
|
||||
# relevant will be automatically skipped.
|
||||
# If no specific tests were requested, select all the numbered tests. Those
|
||||
# that are not relevant will be automatically skipped.
|
||||
|
||||
if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
|
||||
|
@ -416,7 +437,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
||||
$do24 = no -a $do25 = no \
|
||||
$do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -444,6 +465,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do23=yes
|
||||
do24=yes
|
||||
do25=yes
|
||||
do26=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
echo '' >testtry
|
||||
checkspecial '-C'
|
||||
checkspecial '--help'
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
if [ $support_setstack -eq 0 ] ; then
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
fi
|
||||
echo " OK"
|
||||
fi
|
||||
|
||||
|
@ -493,15 +517,20 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
done
|
||||
fi
|
||||
|
||||
# PCRE2 tests that are not Perl-compatible: API, errors, internals
|
||||
# PCRE2 tests that are not Perl-compatible: API, errors, internals. We copy
|
||||
# the testbtables file to the current directory for use by this test.
|
||||
|
||||
if [ $do2 = yes ] ; then
|
||||
echo $title2 "(excluding UTF-$bits)"
|
||||
cp $testdata/testbtables .
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
|
||||
if [ $? = 0 ] ; then
|
||||
saverc=$?
|
||||
if [ $saverc = 0 ] ; then
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -70,-62,-2,-1,0,100,101,191,200 >>testtry
|
||||
checkresult $? 2 "$opt"
|
||||
else
|
||||
checkresult $saverc 2 "$opt"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
@ -855,10 +884,33 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
fi
|
||||
|
||||
# Auto-generated unicode property tests
|
||||
|
||||
if [ $do26 = yes ] ; then
|
||||
echo $title26
|
||||
if [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
|
||||
checkresult $? 26 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Manually selected heap tests - output may vary in different environments,
|
||||
# which is why they are not automatically run.
|
||||
|
||||
if [ $doheap = yes ] ; then
|
||||
echo $titleheap
|
||||
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
|
||||
checkresult $? heap-$bits ""
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
||||
# Clean up local working files
|
||||
rm -f testSinput test3input testsaved1 testsaved2 test3output test3outputA test3outputB teststdout teststderr testtry
|
||||
rm -f testbtables testSinput test3input testsaved1 testsaved2 test3output test3outputA test3outputB teststdout teststderr testtry
|
||||
|
||||
# End
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
@rem Updated for new test 14 (moving others up a number), August 2015.
|
||||
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
|
||||
@rem PH added missing "set type" for test 22, April 2016.
|
||||
@rem PH added copy command for new testbtables file, November 2020
|
||||
|
||||
|
||||
setlocal enabledelayedexpansion
|
||||
|
@ -134,9 +135,9 @@ if "%all%" == "yes" (
|
|||
set do7=yes
|
||||
set do8=yes
|
||||
set do9=yes
|
||||
set do10=yes
|
||||
set do10=no
|
||||
set do11=yes
|
||||
set do12=yes
|
||||
set do12=no
|
||||
set do13=yes
|
||||
set do14=yes
|
||||
set do15=yes
|
||||
|
@ -305,6 +306,7 @@ if %jit% EQU 1 call :runsub 1 testoutjit "Test with JIT Override" -q -jit
|
|||
goto :eof
|
||||
|
||||
:do2
|
||||
copy /y %srcdir%\testdata\testbtables testbtables
|
||||
call :runsub 2 testout "API, errors, internals, and non-Perl stuff" -q
|
||||
if %jit% EQU 1 call :runsub 2 testoutjit "Test with JIT Override" -q -jit
|
||||
goto :eof
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# See MODULE.bazel
|
|
@ -1,17 +1,16 @@
|
|||
# Modified from FindReadline.cmake (PH Feb 2012)
|
||||
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
set(EDITLINE_FOUND TRUE)
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
|
||||
/usr/include/editline
|
||||
/usr/include/edit/readline
|
||||
/usr/include/readline
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
|
||||
editline
|
||||
edit/readline
|
||||
)
|
||||
|
||||
FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
|
||||
MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
set(PACKAGE_VERSION_MAJOR @PCRE2_MAJOR@)
|
||||
set(PACKAGE_VERSION_MINOR @PCRE2_MINOR@)
|
||||
set(PACKAGE_VERSION_PATCH 0)
|
||||
set(PACKAGE_VERSION @PCRE2_MAJOR@.@PCRE2_MINOR@.0)
|
||||
|
||||
# Check whether the requested PACKAGE_FIND_VERSION is compatible
|
||||
if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION OR
|
||||
PACKAGE_VERSION_MAJOR GREATER PACKAGE_FIND_VERSION_MAJOR)
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
else()
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
if(PACKAGE_VERSION VERSION_EQUAL PACKAGE_FIND_VERSION)
|
||||
set(PACKAGE_VERSION_EXACT TRUE)
|
||||
endif()
|
||||
endif()
|
|
@ -0,0 +1,145 @@
|
|||
# pcre2-config.cmake
|
||||
# ----------------
|
||||
#
|
||||
# Finds the PCRE2 library, specify the starting search path in PCRE2_ROOT.
|
||||
#
|
||||
# Static vs. shared
|
||||
# -----------------
|
||||
# To make use of the static library instead of the shared one, one needs
|
||||
# to set the variable PCRE2_USE_STATIC_LIBS to ON before calling find_package.
|
||||
# Example:
|
||||
# set(PCRE2_USE_STATIC_LIBS ON)
|
||||
# find_package(PCRE2 CONFIG COMPONENTS 8BIT)
|
||||
#
|
||||
# This will define the following variables:
|
||||
#
|
||||
# PCRE2_FOUND - True if the system has the PCRE2 library.
|
||||
# PCRE2_VERSION - The version of the PCRE2 library which was found.
|
||||
#
|
||||
# and the following imported targets:
|
||||
#
|
||||
# PCRE2::8BIT - The 8 bit PCRE2 library.
|
||||
# PCRE2::16BIT - The 16 bit PCRE2 library.
|
||||
# PCRE2::32BIT - The 32 bit PCRE2 library.
|
||||
# PCRE2::POSIX - The POSIX PCRE2 library.
|
||||
|
||||
set(PCRE2_NON_STANDARD_LIB_PREFIX @NON_STANDARD_LIB_PREFIX@)
|
||||
set(PCRE2_NON_STANDARD_LIB_SUFFIX @NON_STANDARD_LIB_SUFFIX@)
|
||||
set(PCRE2_8BIT_NAME pcre2-8)
|
||||
set(PCRE2_16BIT_NAME pcre2-16)
|
||||
set(PCRE2_32BIT_NAME pcre2-32)
|
||||
set(PCRE2_POSIX_NAME pcre2-posix)
|
||||
find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h DOC "PCRE2 include directory")
|
||||
if (PCRE2_USE_STATIC_LIBS)
|
||||
if (MSVC)
|
||||
set(PCRE2_8BIT_NAME pcre2-8-static)
|
||||
set(PCRE2_16BIT_NAME pcre2-16-static)
|
||||
set(PCRE2_32BIT_NAME pcre2-32-static)
|
||||
set(PCRE2_POSIX_NAME pcre2-posix-static)
|
||||
endif ()
|
||||
|
||||
set(PCRE2_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
|
||||
set(PCRE2_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
|
||||
else ()
|
||||
set(PCRE2_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
|
||||
if (MINGW AND PCRE2_NON_STANDARD_LIB_PREFIX)
|
||||
set(PCRE2_PREFIX "")
|
||||
endif ()
|
||||
|
||||
set(PCRE2_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
|
||||
if (MINGW AND PCRE2_NON_STANDARD_LIB_SUFFIX)
|
||||
set(PCRE2_SUFFIX "-0.dll")
|
||||
endif ()
|
||||
endif ()
|
||||
# Locate each PCRE2 library. Every search accepts two candidate file names:
# the plain name and a "d"-postfixed variant (presumably the debug-build
# name; confirm against the postfix configured in CMakeLists.txt).
# Each search derives BOTH candidates from its own base name, so that a
# postfixed 8-bit library is never picked up as the 16-bit, 32-bit or POSIX
# library.
find_library(PCRE2_8BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit PCRE2 library")
find_library(PCRE2_16BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}d${PCRE2_SUFFIX} DOC "16 bit PCRE2 library")
find_library(PCRE2_32BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}d${PCRE2_SUFFIX} DOC "32 bit PCRE2 library")
find_library(PCRE2_POSIX_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}d${PCRE2_SUFFIX} DOC "8 bit POSIX PCRE2 library")
|
||||
unset(PCRE2_NON_STANDARD_LIB_PREFIX)
|
||||
unset(PCRE2_NON_STANDARD_LIB_SUFFIX)
|
||||
unset(PCRE2_8BIT_NAME)
|
||||
unset(PCRE2_16BIT_NAME)
|
||||
unset(PCRE2_32BIT_NAME)
|
||||
unset(PCRE2_POSIX_NAME)
|
||||
|
||||
# Set version
|
||||
if (PCRE2_INCLUDE_DIR)
|
||||
set(PCRE2_VERSION "@PCRE2_MAJOR@.@PCRE2_MINOR@.0")
|
||||
endif ()
|
||||
|
||||
# Which components have been found.
|
||||
if (PCRE2_8BIT_LIBRARY)
|
||||
set(PCRE2_8BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_16BIT_LIBRARY)
|
||||
set(PCRE2_16BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_32BIT_LIBRARY)
|
||||
set(PCRE2_32BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_POSIX_LIBRARY)
|
||||
set(PCRE2_POSIX_FOUND TRUE)
|
||||
endif ()
|
||||
|
||||
# Check if at least one component has been specified.
|
||||
list(LENGTH PCRE2_FIND_COMPONENTS PCRE2_NCOMPONENTS)
|
||||
if (PCRE2_NCOMPONENTS LESS 1)
|
||||
message(FATAL_ERROR "No components have been specified. This is not allowed. Please, specify at least one component.")
|
||||
endif ()
|
||||
unset(PCRE2_NCOMPONENTS)
|
||||
|
||||
# When POSIX component has been specified make sure that also 8BIT component is specified.
|
||||
set(PCRE2_8BIT_COMPONENT FALSE)
|
||||
set(PCRE2_POSIX_COMPONENT FALSE)
|
||||
foreach(component ${PCRE2_FIND_COMPONENTS})
|
||||
if (component STREQUAL "8BIT")
|
||||
set(PCRE2_8BIT_COMPONENT TRUE)
|
||||
elseif (component STREQUAL "POSIX")
|
||||
set(PCRE2_POSIX_COMPONENT TRUE)
|
||||
endif ()
|
||||
endforeach()
|
||||
|
||||
if (PCRE2_POSIX_COMPONENT AND NOT PCRE2_8BIT_COMPONENT)
|
||||
message(FATAL_ERROR "The component POSIX is specified while the 8BIT one is not. This is not allowed. Please, also specify the 8BIT component.")
|
||||
endif()
|
||||
unset(PCRE2_8BIT_COMPONENT)
|
||||
unset(PCRE2_POSIX_COMPONENT)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
|
||||
find_package_handle_standard_args(PCRE2
|
||||
FOUND_VAR PCRE2_FOUND
|
||||
REQUIRED_VARS PCRE2_INCLUDE_DIR
|
||||
HANDLE_COMPONENTS
|
||||
VERSION_VAR PCRE2_VERSION
|
||||
CONFIG_MODE
|
||||
)
|
||||
|
||||
set(PCRE2_LIBRARIES)
|
||||
if (PCRE2_FOUND)
|
||||
foreach(component ${PCRE2_FIND_COMPONENTS})
|
||||
if (PCRE2_USE_STATIC_LIBS)
|
||||
add_library(PCRE2::${component} STATIC IMPORTED)
|
||||
target_compile_definitions(PCRE2::${component} INTERFACE PCRE2_STATIC)
|
||||
else ()
|
||||
add_library(PCRE2::${component} SHARED IMPORTED)
|
||||
endif ()
|
||||
set_target_properties(PCRE2::${component} PROPERTIES
|
||||
IMPORTED_LOCATION "${PCRE2_${component}_LIBRARY}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_INCLUDE_DIR}"
|
||||
)
|
||||
if (component STREQUAL "POSIX")
|
||||
set_target_properties(PCRE2::${component} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "PCRE2::8BIT"
|
||||
LINK_LIBRARIES "PCRE2::8BIT"
|
||||
)
|
||||
endif ()
|
||||
|
||||
set(PCRE2_LIBRARIES ${PCRE2_LIBRARIES} ${PCRE2_${component}_LIBRARY})
|
||||
mark_as_advanced(PCRE2_${component}_LIBRARY)
|
||||
endforeach()
|
||||
endif ()
|
||||
|
||||
mark_as_advanced(
|
||||
PCRE2_INCLUDE_DIR
|
||||
)
|
|
@ -1,8 +1,7 @@
|
|||
/* config.h for CMake builds */
|
||||
|
||||
#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
|
||||
#cmakedefine HAVE_DIRENT_H 1
|
||||
#cmakedefine HAVE_INTTYPES_H 1
|
||||
#cmakedefine HAVE_STDINT_H 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
#cmakedefine HAVE_SYS_STAT_H 1
|
||||
#cmakedefine HAVE_SYS_TYPES_H 1
|
||||
|
@ -10,9 +9,10 @@
|
|||
#cmakedefine HAVE_WINDOWS_H 1
|
||||
|
||||
#cmakedefine HAVE_BCOPY 1
|
||||
#cmakedefine HAVE_MEMFD_CREATE 1
|
||||
#cmakedefine HAVE_MEMMOVE 1
|
||||
|
||||
#cmakedefine PCRE2_STATIC 1
|
||||
#cmakedefine HAVE_SECURE_GETENV 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
|
||||
#cmakedefine SUPPORT_PCRE2_8 1
|
||||
#cmakedefine SUPPORT_PCRE2_16 1
|
||||
|
|
113
configure.ac
113
configure.ac
|
@ -9,21 +9,21 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
|
|||
dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||
|
||||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [34])
|
||||
m4_define(pcre2_minor, [41])
|
||||
m4_define(pcre2_prerelease, [])
|
||||
m4_define(pcre2_date, [2019-11-21])
|
||||
m4_define(pcre2_date, [2022-xx-xx])
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [11:0:11])
|
||||
m4_define(libpcre2_16_version, [11:0:11])
|
||||
m4_define(libpcre2_32_version, [11:0:11])
|
||||
m4_define(libpcre2_posix_version, [3:2:0])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [9:0:9])
|
||||
m4_define(libpcre2_16_version, [9:0:9])
|
||||
m4_define(libpcre2_32_version, [9:0:9])
|
||||
m4_define(libpcre2_posix_version, [2:3:0])
|
||||
|
||||
AC_PREREQ(2.57)
|
||||
AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2)
|
||||
AC_PREREQ([2.60])
|
||||
AC_INIT([PCRE2],pcre2_major.pcre2_minor[]pcre2_prerelease,[],[pcre2])
|
||||
AC_CONFIG_SRCDIR([src/pcre2.h.in])
|
||||
AM_INIT_AUTOMAKE([dist-bzip2 dist-zip])
|
||||
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
||||
|
@ -64,14 +64,31 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
|
|||
AC_TYPE_INT64_T
|
||||
|
||||
AC_PROG_INSTALL
|
||||
AC_LIBTOOL_WIN32_DLL
|
||||
LT_INIT
|
||||
LT_INIT([win32-dll])
|
||||
AC_PROG_LN_S
|
||||
|
||||
# Check for GCC visibility feature
|
||||
|
||||
PCRE2_VISIBILITY
|
||||
|
||||
# Check for Clang __attribute__((uninitialized)) feature
|
||||
|
||||
AC_MSG_CHECKING([for __attribute__((uninitialized))])
|
||||
AC_LANG_PUSH([C])
|
||||
tmp_CFLAGS=$CFLAGS
|
||||
CFLAGS="$CFLAGS -Werror"
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
|
||||
[[char buf[128] __attribute__((uninitialized));(void)buf]])],
|
||||
[pcre2_cc_cv_attribute_uninitialized=yes],
|
||||
[pcre2_cc_cv_attribute_uninitialized=no])
|
||||
AC_MSG_RESULT([$pcre2_cc_cv_attribute_uninitialized])
|
||||
if test "$pcre2_cc_cv_attribute_uninitialized" = yes; then
|
||||
AC_DEFINE([HAVE_ATTRIBUTE_UNINITIALIZED], 1, [Define this if your compiler
|
||||
supports __attribute__((uninitialized))])
|
||||
fi
|
||||
CFLAGS=$tmp_CFLAGS
|
||||
AC_LANG_POP([C])
|
||||
|
||||
# Versioning
|
||||
|
||||
PCRE2_MAJOR="pcre2_major"
|
||||
|
@ -158,11 +175,18 @@ if test "$enable_jit" = "auto"; then
|
|||
echo checking for JIT support on this hardware... $enable_jit
|
||||
fi
|
||||
|
||||
# Handle --enable-jit-sealloc (disabled by default)
|
||||
AC_ARG_ENABLE(jit-sealloc,
|
||||
AS_HELP_STRING([--enable-jit-sealloc],
|
||||
[enable SELinux compatible execmem allocator in JIT (experimental)]),
|
||||
, enable_jit_sealloc=no)
|
||||
# Handle --enable-jit-sealloc (disabled by default and only experimental)
|
||||
case $host_os in
|
||||
linux* | netbsd*)
|
||||
AC_ARG_ENABLE(jit-sealloc,
|
||||
AS_HELP_STRING([--enable-jit-sealloc],
|
||||
[enable SELinux compatible execmem allocator in JIT (experimental)]),
|
||||
,enable_jit_sealloc=no)
|
||||
;;
|
||||
*)
|
||||
enable_jit_sealloc=unsupported
|
||||
;;
|
||||
esac
|
||||
|
||||
# Handle --disable-pcre2grep-jit (enabled by default)
|
||||
AC_ARG_ENABLE(pcre2grep-jit,
|
||||
|
@ -399,7 +423,7 @@ case "$enable_newline" in
|
|||
anycrlf) ac_pcre2_newline_value=5 ;;
|
||||
nul) ac_pcre2_newline_value=6 ;;
|
||||
*)
|
||||
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
|
||||
AC_MSG_ERROR([invalid argument "$enable_newline" to --enable-newline option])
|
||||
;;
|
||||
esac
|
||||
|
||||
|
@ -428,7 +452,7 @@ fi
|
|||
case "$with_link_size" in
|
||||
2|3|4) ;;
|
||||
*)
|
||||
AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option])
|
||||
AC_MSG_ERROR([invalid argument "$with_link_size" to --with-link-size option])
|
||||
;;
|
||||
esac
|
||||
|
||||
|
@ -461,7 +485,6 @@ HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
|
|||
sure both macros are undefined; an emulation function will then be used. */])
|
||||
|
||||
# Checks for header files.
|
||||
AC_HEADER_STDC
|
||||
AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h)
|
||||
AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1])
|
||||
AC_CHECK_HEADERS([sys/wait.h], [HAVE_SYS_WAIT_H=1])
|
||||
|
@ -489,7 +512,20 @@ AC_TYPE_SIZE_T
|
|||
|
||||
# Checks for library functions.
|
||||
|
||||
AC_CHECK_FUNCS(bcopy memmove strerror mkostemp secure_getenv)
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
|
||||
AC_MSG_CHECKING([for realpath])
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
]],[[
|
||||
char buffer[PATH_MAX];
|
||||
realpath(".", buffer);
|
||||
]])],
|
||||
[AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_REALPATH], 1,
|
||||
[Define to 1 if you have the `realpath' function.])
|
||||
],
|
||||
AC_MSG_RESULT([no]))
|
||||
|
||||
# Check for the availability of libz (aka zlib)
|
||||
|
||||
|
@ -561,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
|
|||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Check for the availability of libedit. Different distributions put its
|
||||
# headers in different places. Try to cover the most common ones.
|
||||
|
||||
if test "$enable_pcre2test_libedit" = "yes"; then
|
||||
AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
|
||||
AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
|
||||
HAVE_LIBEDIT_HEADER=1
|
||||
break
|
||||
])
|
||||
AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
|
||||
fi
|
||||
|
||||
|
@ -904,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
|
|||
echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
|
||||
"$HAVE_READLINE_READLINE_H" != "1"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
|
||||
echo "** nor readline/readline.h was found."
|
||||
if test -z "$HAVE_LIBEDIT_HEADER"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
|
||||
echo "** edit/readline/readline.h nor a compatible header was found."
|
||||
exit 1
|
||||
fi
|
||||
if test -z "$LIBEDIT"; then
|
||||
|
@ -981,7 +1016,27 @@ fi # enable_coverage
|
|||
|
||||
AM_CONDITIONAL([WITH_GCOV],[test "x$enable_coverage" = "xyes"])
|
||||
|
||||
AC_MSG_CHECKING([whether Intel CET is enabled])
|
||||
AC_LANG_PUSH([C])
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
|
||||
[[#ifndef __CET__
|
||||
# error CET is not enabled
|
||||
#endif]])],
|
||||
[pcre2_cc_cv_intel_cet_enabled=yes],
|
||||
[pcre2_cc_cv_intel_cet_enabled=no])
|
||||
AC_MSG_RESULT([$pcre2_cc_cv_intel_cet_enabled])
|
||||
if test "$pcre2_cc_cv_intel_cet_enabled" = yes; then
|
||||
CET_CFLAGS="-mshstk"
|
||||
AC_SUBST([CET_CFLAGS])
|
||||
fi
|
||||
AC_LANG_POP([C])
|
||||
|
||||
# LIB_POSTFIX is used by CMakeLists.txt for Windows debug builds.
|
||||
# Pass empty LIB_POSTFIX to *.pc files and pcre2-config here.
|
||||
AC_SUBST(LIB_POSTFIX)
|
||||
|
||||
# Produce these files, in addition to config.h.
|
||||
|
||||
AC_CONFIG_FILES(
|
||||
Makefile
|
||||
libpcre2-8.pc
|
||||
|
|
|
@ -40,7 +40,11 @@ GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY
|
|||
|
||||
The following are generic instructions for building the PCRE2 C library "by
|
||||
hand". If you are going to use CMake, this section does not apply to you; you
|
||||
can skip ahead to the CMake section.
|
||||
can skip ahead to the CMake section. Note that the settings concerned with
|
||||
8-bit, 16-bit, and 32-bit code units relate to the type of data string that
|
||||
PCRE2 processes. They are NOT referring to the underlying operating system bit
|
||||
width. You do not have to do anything special to compile in a 64-bit
|
||||
environment, for example.
|
||||
|
||||
(1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
|
||||
macro settings that it contains to whatever is appropriate for your
|
||||
|
@ -74,23 +78,23 @@ can skip ahead to the CMake section.
|
|||
src/pcre2_chartables.c.
|
||||
|
||||
OR:
|
||||
Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
|
||||
if you have set up src/config.h), and then run it with the single
|
||||
argument "src/pcre2_chartables.c". This generates a set of standard
|
||||
character tables and writes them to that file. The tables are generated
|
||||
using the default C locale for your system. If you want to use a locale
|
||||
that is specified by LC_xxx environment variables, add the -L option to
|
||||
the dftables command. You must use this method if you are building on a
|
||||
system that uses EBCDIC code.
|
||||
Compile src/pcre2_dftables.c as a stand-alone program (using
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h), and then run it with
|
||||
the single argument "src/pcre2_chartables.c". This generates a set of
|
||||
standard character tables and writes them to that file. The tables are
|
||||
generated using the default C locale for your system. If you want to use
|
||||
a locale that is specified by LC_xxx environment variables, add the -L
|
||||
option to the pcre2_dftables command. You must use this method if you
|
||||
are building on a system that uses EBCDIC code.
|
||||
|
||||
The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
|
||||
specify alternative tables at run time.
|
||||
|
||||
(4) For an 8-bit library, compile the following source files from the src
|
||||
directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also
|
||||
set -DHAVE_CONFIG_H if you have set up src/config.h with your
|
||||
configuration, or else use other -D settings to change the configuration
|
||||
as required.
|
||||
(4) For a library that supports 8-bit code units in the character strings that
|
||||
it processes, compile the following source files from the src directory,
|
||||
setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h with your configuration,
|
||||
or else use other -D settings to change the configuration as required.
|
||||
|
||||
pcre2_auto_possess.c
|
||||
pcre2_chartables.c
|
||||
|
@ -117,6 +121,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -142,9 +147,9 @@ can skip ahead to the CMake section.
|
|||
If your system has static and shared libraries, you may have to do this
|
||||
once for each type.
|
||||
|
||||
(6) If you want to build a 16-bit library or 32-bit library (as well as, or
|
||||
instead of the 8-bit library) just supply 16 or 32 as the value of
|
||||
-DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
(6) If you want to build a library that supports 16-bit or 32-bit code units
|
||||
(as well as, or instead of the 8-bit library) just supply 16 or 32 as the
|
||||
value of -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
|
||||
(7) If you want to build the POSIX wrapper functions (which apply only to the
|
||||
8-bit library), ensure that you have the src/pcre2posix.h file and then
|
||||
|
@ -302,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -339,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -369,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (wherein the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
@ -401,6 +406,6 @@ Everything in that location, source and executable, is in EBCDIC and native
|
|||
z/OS file formats. The port provides an API for LE languages such as COBOL and
|
||||
for the z/OS and z/VM versions of the Rexx languages.
|
||||
|
||||
==============================
|
||||
Last Updated: 14 November 2018
|
||||
==============================
|
||||
===========================
|
||||
Last Updated: 28 April 2021
|
||||
===========================
|
||||
|
|
|
@ -4,18 +4,20 @@ README file for PCRE2 (Perl-compatible regular expression library)
|
|||
PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
||||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features and the internals have been improved. The latest release of PCRE2 is
|
||||
available in three alternative formats from:
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, .tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.gz
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.bz2
|
||||
https://ftp.pcre.org/pub/pcre/pcre2-10.xx.tar.zip
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE (both the
|
||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||
subscribe or manage your subscription here:
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
pcre2-dev+subscribe@googlegroups.com.
|
||||
|
||||
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -112,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -186,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -269,9 +277,9 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
a program called dftables is compiled and run in the default C locale when
|
||||
you obey "make". It builds a source file called pcre2_chartables.c. If you do
|
||||
not specify this option, pcre2_chartables.c is created as a copy of
|
||||
a program called pcre2_dftables is compiled and run in the default C locale
|
||||
when you obey "make". It builds a source file called pcre2_chartables.c. If
|
||||
you do not specify this option, pcre2_chartables.c is created as a copy of
|
||||
pcre2_chartables.c.dist. See "Character tables" below for further
|
||||
information.
|
||||
|
||||
|
@ -297,8 +305,8 @@ library. They are also documented in the pcre2build man page.
|
|||
unaddressable. This allows it to detect invalid memory accesses, and is
|
||||
mostly useful for debugging PCRE2 itself.
|
||||
|
||||
. In environments where the gcc compiler is used and lcov version 1.6 or above
|
||||
is installed, if you specify
|
||||
. In environments where the gcc compiler is used and lcov is installed, if you
|
||||
specify
|
||||
|
||||
--enable-coverage
|
||||
|
||||
|
@ -367,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -392,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -409,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -548,11 +557,11 @@ Cross-compiling using autotools
|
|||
|
||||
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||
order to cross-compile PCRE2 for some other host. However, you should NOT
|
||||
specify --enable-rebuild-chartables, because if you do, the dftables.c source
|
||||
file is compiled and run on the local host, in order to generate the inbuilt
|
||||
character tables (the pcre2_chartables.c file). This will probably not work,
|
||||
because dftables.c needs to be compiled with the local compiler, not the cross
|
||||
compiler.
|
||||
specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c
|
||||
source file is compiled and run on the local host, in order to generate the
|
||||
inbuilt character tables (the pcre2_chartables.c file). This will probably not
|
||||
work, because pcre2_dftables.c needs to be compiled with the local compiler,
|
||||
not the cross compiler.
|
||||
|
||||
When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
|
||||
created by making a copy of pcre2_chartables.c.dist, which is a default set of
|
||||
|
@ -560,9 +569,10 @@ tables that assumes ASCII code. Cross-compiling with the default tables should
|
|||
not be a problem.
|
||||
|
||||
If you need to modify the character tables when cross-compiling, you should
|
||||
move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
|
||||
and run it on the local host to make a new version of pcre2_chartables.c.dist.
|
||||
Then when you cross-compile PCRE2 this new version of the tables will be used.
|
||||
move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by
|
||||
hand and run it on the local host to make a new version of
|
||||
pcre2_chartables.c.dist. See the pcre2build section "Creating character tables
|
||||
at build time" for more details.
|
||||
|
||||
|
||||
Making new tarballs
|
||||
|
@ -599,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -686,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -721,8 +731,8 @@ compile context.
|
|||
The source file called pcre2_chartables.c contains the default set of tables.
|
||||
By default, this is created as a copy of pcre2_chartables.c.dist, which
|
||||
contains tables for ASCII coding. However, if --enable-rebuild-chartables is
|
||||
specified for ./configure, a different version of pcre2_chartables.c is built
|
||||
by the program dftables (compiled from dftables.c), which uses the ANSI C
|
||||
specified for ./configure, a new version of pcre2_chartables.c is built by the
|
||||
program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C
|
||||
character handling functions such as isalnum(), isalpha(), isupper(),
|
||||
islower(), etc. to build the table sources. This means that the default C
|
||||
locale that is set for your system will control the contents of these default
|
||||
|
@ -732,32 +742,40 @@ file does not get automatically re-generated. The best way to do this is to
|
|||
move pcre2_chartables.c.dist out of the way and replace it with your customized
|
||||
tables.
|
||||
|
||||
When the dftables program is run as a result of --enable-rebuild-chartables,
|
||||
it uses the default C locale that is set on your system. It does not pay
|
||||
attention to the LC_xxx environment variables. In other words, it uses the
|
||||
system's default locale rather than whatever the compiling user happens to have
|
||||
set. If you really do want to build a source set of character tables in a
|
||||
locale that is specified by the LC_xxx variables, you can run the dftables
|
||||
program by hand with the -L option. For example:
|
||||
When the pcre2_dftables program is run as a result of specifying
|
||||
--enable-rebuild-chartables, it uses the default C locale that is set on your
|
||||
system. It does not pay attention to the LC_xxx environment variables. In other
|
||||
words, it uses the system's default locale rather than whatever the compiling
|
||||
user happens to have set. If you really do want to build a source set of
|
||||
character tables in a locale that is specified by the LC_xxx variables, you can
|
||||
run the pcre2_dftables program by hand with the -L option. For example:
|
||||
|
||||
./dftables -L pcre2_chartables.c.special
|
||||
./pcre2_dftables -L pcre2_chartables.c.special
|
||||
|
||||
The first two 256-byte tables provide lower casing and case flipping functions,
|
||||
respectively. The next table consists of three 32-byte bit maps which identify
|
||||
digits, "word" characters, and white space, respectively. These are used when
|
||||
building 32-byte bit maps that represent character classes for code points less
|
||||
than 256. The final 256-byte table has bits indicating various character types,
|
||||
as follows:
|
||||
The second argument names the file where the source code for the tables is
|
||||
written. The first two 256-byte tables provide lower casing and case flipping
|
||||
functions, respectively. The next table consists of a number of 32-byte bit
|
||||
maps which identify certain character classes such as digits, "word"
|
||||
characters, white space, etc. These are used when building 32-byte bit maps
|
||||
that represent character classes for code points less than 256. The final
|
||||
256-byte table has bits indicating various character types, as follows:
|
||||
|
||||
1 white space character
|
||||
2 letter
|
||||
4 decimal digit
|
||||
8 hexadecimal digit
|
||||
4 lower case letter
|
||||
8 decimal digit
|
||||
16 alphanumeric or '_'
|
||||
128 regular expression metacharacter or binary zero
|
||||
|
||||
You should not alter the set of characters that contain the 128 bit, as that
|
||||
will cause PCRE2 to malfunction.
|
||||
You can also specify -b (with or without -L) when running pcre2_dftables. This
|
||||
causes the tables to be written in binary instead of as source code. A set of
|
||||
binary tables can be loaded into memory by an application and passed to
|
||||
pcre2_compile() in the same way as tables created dynamically by calling
|
||||
pcre2_maketables(). The tables are just a string of bytes, independent of
|
||||
hardware characteristics such as endianness. This means they can be bundled
|
||||
with an application that runs in different environments, to ensure consistent
|
||||
behaviour.
|
||||
|
||||
See also the pcre2build section "Creating character tables at build time".
|
||||
|
||||
|
||||
File manifest
|
||||
|
@ -768,7 +786,7 @@ The distribution should contain the files listed below.
|
|||
(A) Source files for the PCRE2 library functions and their headers are found in
|
||||
the src directory:
|
||||
|
||||
src/dftables.c auxiliary program for building pcre2_chartables.c
|
||||
src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c
|
||||
when --enable-rebuild-chartables is specified
|
||||
|
||||
src/pcre2_chartables.c.dist a default set of character tables that assume
|
||||
|
@ -892,6 +910,6 @@ The distribution should contain the files listed below.
|
|||
) environments
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 April 2019
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -28,7 +28,8 @@ nearly two decades, the limitations of the original API were making development
|
|||
increasingly difficult. The new API is more extensible, and it was simplified
|
||||
by abolishing the separate "study" optimizing function; in PCRE2, patterns are
|
||||
automatically optimized where possible. Since forking from PCRE1, the code has
|
||||
been extensively refactored and new features introduced.
|
||||
been extensively refactored and new features introduced. The old library is now
|
||||
obsolete and is no longer maintained.
|
||||
</P>
|
||||
<P>
|
||||
As well as Perl-style regular expression patterns, some features that appeared
|
||||
|
@ -38,8 +39,14 @@ Oniguruma syntax items, and there are options for requesting some minor changes
|
|||
that give better ECMAScript (aka JavaScript) compatibility.
|
||||
</P>
|
||||
<P>
|
||||
The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
|
||||
code units, which means that up to three separate libraries may be installed.
|
||||
The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit,
|
||||
or 32-bit code units, which means that up to three separate libraries may be
|
||||
installed, one for each code unit size. The size of code unit is not related to
|
||||
the bit size of the underlying hardware. In a 64-bit environment that also
|
||||
supports 32-bit applications, versions of PCRE2 that are compiled in both
|
||||
64-bit and 32-bit modes may be needed.
|
||||
</P>
|
||||
<P>
|
||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||
|
@ -187,20 +194,20 @@ function, listing its arguments and results.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<P>
|
||||
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||
use my two names separated by a dot at gmail.com.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 September 2018
|
||||
Last updated: 27 August 2021
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -92,8 +92,18 @@ Additional options may be set in the compile context via the
|
|||
function.
|
||||
</P>
|
||||
<P>
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the <i>errorcode</i> argument to the
|
||||
<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
|
||||
error was encountered is returned via the <i>erroroffset</i> argument.
|
||||
</P>
|
||||
<P>
|
||||
If there is no error, the value passed via <i>errorcode</i> returns the message
|
||||
"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
|
||||
via <i>erroroffset</i> is zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
|
|
|
@ -45,10 +45,16 @@ just once (except when processing lookaround assertions). This function is
|
|||
<i>workspace</i> Points to a vector of ints used as working space
|
||||
<i>wscount</i> Number of elements in the vector
|
||||
</pre>
|
||||
For <b>pcre2_dfa_match()</b>, a match context is needed only if you want to set
|
||||
up a callout function or specify the heap limit or the match or the recursion
|
||||
depth limits. The <i>length</i> and <i>startoffset</i> values are code units, not
|
||||
characters. The options are:
|
||||
The size of output vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
|
||||
data block is therefore not advisable when using this function.
|
||||
</P>
|
||||
<P>
|
||||
A match context is needed only if you want to set up a callout function or
|
||||
specify the heap limit or the match or the recursion depth limits. The
|
||||
<i>length</i> and <i>startoffset</i> values are code units, not characters. The
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_COPY_MATCHED_SUBJECT
|
||||
|
|
|
@ -29,7 +29,7 @@ This function frees unused JIT executable memory. The argument is a general
|
|||
context, for custom memory management, or NULL for standard memory management.
|
||||
JIT memory allocation retains some memory in order to improve future JIT
|
||||
compilation speed. In low memory conditions,
|
||||
\fBpcre2_jit_free_unused_memory()\fB can be used to cause this memory to be
|
||||
<b>pcre2_jit_free_unused_memory()</b> can be used to cause this memory to be
|
||||
freed.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -33,7 +33,9 @@ processed by the JIT compiler against a given subject string, using a matching
|
|||
algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and
|
||||
it bypasses some of the sanity checks that <b>pcre2_match()</b> applies.
|
||||
Its arguments are exactly the same as for
|
||||
<a href="pcre2_match.html"><b>pcre2_match()</b>.</a>
|
||||
<a href="pcre2_match.html"><b>pcre2_match()</b>,</a>
|
||||
except that the subject string must be specified with a length;
|
||||
PCRE2_ZERO_TERMINATED is not supported.
|
||||
</P>
|
||||
<P>
|
||||
The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
|
|
|
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
<b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
|
||||
which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
page.
|
||||
</P>
|
||||
|
|
|
@ -30,8 +30,9 @@ This function creates a new match data block, which is used for holding the
|
|||
result of a match. The first argument specifies the number of pairs of offsets
|
||||
that are required. These form the "output vector" (ovector) within the match
|
||||
data block, and are used to identify the matched string and any captured
|
||||
substrings. There is always one pair of offsets; if <b>ovecsize</b> is zero, it
|
||||
is treated as one.
|
||||
substrings when matching with <b>pcre2_match()</b>, or a number of different
|
||||
matches at the same point when used with <b>pcre2_dfa_match()</b>. There is
|
||||
always one pair of offsets; if <b>ovecsize</b> is zero, it is treated as one.
|
||||
</P>
|
||||
<P>
|
||||
The second argument points to a general context, for custom memory management,
|
||||
|
|
|
@ -26,12 +26,15 @@ SYNOPSIS
|
|||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function creates a new match data block, which is used for holding the
|
||||
result of a match. The first argument points to a compiled pattern. The number
|
||||
of capturing parentheses within the pattern is used to compute the number of
|
||||
pairs of offsets that are required in the match data block. These form the
|
||||
"output vector" (ovector) within the match data block, and are used to identify
|
||||
the matched string and any captured substrings.
|
||||
This function creates a new match data block for holding the result of a match.
|
||||
The first argument points to a compiled pattern. The number of capturing
|
||||
parentheses within the pattern is used to compute the number of pairs of
|
||||
offsets that are required in the match data block. These form the "output
|
||||
vector" (ovector) within the match data block, and are used to identify the
|
||||
matched string and any captured substrings when matching with
|
||||
<b>pcre2_match()</b>. If you are using <b>pcre2_dfa_match()</b>, which uses the
|
||||
output vector in a different way, you should use <b>pcre2_match_data_create()</b>
|
||||
instead of this function.
|
||||
</P>
|
||||
<P>
|
||||
The second argument points to a general context, for custom memory management,
|
||||
|
|
|
@ -48,7 +48,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA <i>number_of_codes</i> is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in <i>bytes</i>
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL <i>codes</i> or <i>bytes</i> is NULL
|
||||
</pre>
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -27,9 +27,12 @@ DESCRIPTION
|
|||
</b><br>
|
||||
<P>
|
||||
This function sets a pointer to custom character tables within a compile
|
||||
context. The second argument must be the result of a call to
|
||||
<b>pcre2_maketables()</b> or NULL to request the default tables. The result is
|
||||
always zero.
|
||||
context. The second argument must point to a set of PCRE2 character tables or
|
||||
be NULL to request the default tables. The result is always zero. Character
|
||||
tables can be created by calling <b>pcre2_maketables()</b> or by running the
|
||||
<b>pcre2_dftables</b> maintenance command in binary mode (see the
|
||||
<a href="pcre2build.html"><b>pcre2build</b></a>
|
||||
documentation).
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -30,7 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{df800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
|
||||
|
|
|
@ -48,8 +48,8 @@ Its arguments are:
|
|||
<i>outlengthptr</i> Points to the length of the output buffer
|
||||
</pre>
|
||||
A match data block is needed only if you want to inspect the data from the
|
||||
match that is returned in that block. A match context is needed only if you
|
||||
want to:
|
||||
final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is
|
||||
set. A match context is needed only if you want to:
|
||||
<pre>
|
||||
Set up a callout function
|
||||
Set a matching offset limit
|
||||
|
@ -57,29 +57,46 @@ want to:
|
|||
Change the backtracking depth limit
|
||||
Set custom memory management in the match context
|
||||
</pre>
|
||||
The <i>length</i>, <i>startoffset</i> and <i>rlength</i> values are code
|
||||
units, not characters, as is the contents of the variable pointed at by
|
||||
<i>outlengthptr</i>, which is updated to the actual length of the new string.
|
||||
The <i>length</i>, <i>startoffset</i> and <i>rlength</i> values are code units,
|
||||
not characters, as is the contents of the variable pointed at by
|
||||
<i>outlengthptr</i>. This variable must contain the length of the output buffer
|
||||
when the function is called. If the function is successful, the value is
|
||||
changed to the length of the new string, excluding the trailing zero that is
|
||||
automatically added.
|
||||
</P>
|
||||
<P>
|
||||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement
|
||||
(only relevant if PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
</pre>
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
|
||||
contents must be the result of a call to <b>pcre2_match()</b> using the same
|
||||
pattern and subject.
|
||||
</P>
|
||||
<P>
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
are no matches. The result may be greater than one only when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned.
|
||||
</P>
|
||||
|
|
|
@ -252,7 +252,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR \fIreplacementzfP,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacementz</i>,</b>
|
||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||
</P>
|
||||
|
@ -626,14 +626,15 @@ documentation for more details.
|
|||
<P>
|
||||
In a more complicated situation, where patterns are compiled only when they are
|
||||
first needed, but are still shared between threads, pointers to compiled
|
||||
patterns must be protected from simultaneous writing by multiple threads, at
|
||||
least until a pattern has been compiled. The logic can be something like this:
|
||||
patterns must be protected from simultaneous writing by multiple threads. This
|
||||
is somewhat tricky to do correctly. If you know that writing to a pointer is
|
||||
atomic in your environment, you can use logic like this:
|
||||
<pre>
|
||||
Get a read-only (shared) lock (mutex) for pointer
|
||||
if (pointer == NULL)
|
||||
{
|
||||
Get a write (unique) lock for pointer
|
||||
pointer = pcre2_compile(...
|
||||
if (pointer == NULL) pointer = pcre2_compile(...
|
||||
}
|
||||
Release the lock
|
||||
Use pointer in pcre2_match()
|
||||
|
@ -641,10 +642,39 @@ least until a pattern has been compiled. The logic can be something like this:
|
|||
Of course, testing for compilation errors should also be included in the code.
|
||||
</P>
|
||||
<P>
|
||||
If JIT is being used, but the JIT compilation is not being done immediately,
|
||||
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
||||
required. JIT compilation updates a pointer within the compiled code block, so
|
||||
a thread must gain unique write access to the pointer before calling
|
||||
The reason for checking the pointer a second time is as follows: Several
|
||||
threads may have acquired the shared lock and tested the pointer for being
|
||||
NULL, but only one of them will be given the write lock, with the rest kept
|
||||
waiting. The winning thread will compile the pattern and store the result.
|
||||
After this thread releases the write lock, another thread will get it, and if
|
||||
it does not retest pointer for being NULL, will recompile the pattern and
|
||||
overwrite the pointer, creating a memory leak and possibly causing other
|
||||
issues.
|
||||
</P>
|
||||
<P>
|
||||
In an environment where writing to a pointer may not be atomic, the above logic
|
||||
is not sufficient. The thread that is doing the compiling may be descheduled
|
||||
after writing only part of the pointer, which could cause other threads to use
|
||||
an invalid value. Instead of checking the pointer itself, a separate "pointer
|
||||
is valid" flag (that can be updated atomically) must be used:
|
||||
<pre>
|
||||
Get a read-only (shared) lock (mutex) for pointer
|
||||
if (!pointer_is_valid)
|
||||
{
|
||||
Get a write (unique) lock for pointer
|
||||
if (!pointer_is_valid)
|
||||
{
|
||||
pointer = pcre2_compile(...
|
||||
pointer_is_valid = TRUE
|
||||
}
|
||||
}
|
||||
Release the lock
|
||||
Use pointer in pcre2_match()
|
||||
</pre>
|
||||
If JIT is being used, but the JIT compilation is not being done immediately
|
||||
(perhaps waiting to see if the pattern is used often enough), similar logic is
|
||||
required. JIT compilation updates a value within the compiled code block, so a
|
||||
thread must gain unique write access to the pointer before calling
|
||||
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> or
|
||||
<b>pcre2_code_copy_with_tables()</b> can be used to obtain a private copy of the
|
||||
compiled code before calling the JIT compiler.
|
||||
|
@ -987,7 +1017,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
</P>
|
||||
<P>
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
|
@ -1000,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|||
limit is set, less than the default.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
<b>pcre2_match()</b> uses the heap are given in the
|
||||
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
|
@ -1059,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<br>
|
||||
<br>
|
||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1105,10 +1133,11 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The function <b>pcre2_config()</b> makes it possible for a PCRE2 client to
|
||||
discover which optional features have been compiled into the PCRE2 library. The
|
||||
The function <b>pcre2_config()</b> makes it possible for a PCRE2 client to find
|
||||
the value of certain configuration parameters and to discover which optional
|
||||
features have been compiled into the PCRE2 library. The
|
||||
<a href="pcre2build.html"><b>pcre2build</b></a>
|
||||
documentation has more details about these optional features.
|
||||
documentation has more details about these features.
|
||||
</P>
|
||||
<P>
|
||||
The first argument for <b>pcre2_config()</b> specifies which information is
|
||||
|
@ -1224,6 +1253,13 @@ over compilation stack usage, see <b>pcre2_set_compile_recursion_guard()</b>.
|
|||
</pre>
|
||||
This parameter is obsolete and should not be used in new code. The output is a
|
||||
uint32_t integer that is always set to zero.
|
||||
<pre>
|
||||
PCRE2_CONFIG_TABLES_LENGTH
|
||||
</pre>
|
||||
The output is a uint32_t integer that gives the length of PCRE2's character
|
||||
processing tables in bytes. For details of these tables see the
|
||||
<a href="#localesupport">section on locale support</a>
|
||||
below.
|
||||
<pre>
|
||||
PCRE2_CONFIG_UNICODE_VERSION
|
||||
</pre>
|
||||
|
@ -1345,8 +1381,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and <b>pcre2_compile()</b> returns a non-NULL value.
|
||||
error has occurred.
|
||||
</P>
|
||||
<P>
|
||||
There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
|
||||
|
@ -1361,15 +1396,18 @@ because the textual error messages that are obtained by calling the
|
|||
message"
|
||||
<a href="#geterrormessage">below)</a>
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in <b>pcre2.h</b>.
|
||||
for both positive and negative error codes in <b>pcre2.h</b>. When compilation
|
||||
is successful <i>errorcode</i> is set to a value that returns the message "no
|
||||
error" if passed to <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The value returned in <i>erroroffset</i> is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
</P>
|
||||
<P>
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
|
@ -1481,13 +1519,16 @@ documentation.
|
|||
</pre>
|
||||
If this bit is set, letters in the pattern match both upper and lower case
|
||||
letters in the subject. It is equivalent to Perl's /i option, and it can be
|
||||
changed within a pattern by a (?i) option setting. If PCRE2_UTF is set, Unicode
|
||||
properties are used for all characters with more than one other case, and for
|
||||
all characters whose code points are greater than U+007F. For lower valued
|
||||
characters with only one other case, a lookup table is used for speed. When
|
||||
PCRE2_UTF is not set, a lookup table is used for all code points less than 256,
|
||||
and higher code points (available only in 16-bit or 32-bit mode) are treated as
|
||||
not having another case.
|
||||
changed within a pattern by a (?i) option setting. If either PCRE2_UTF or
|
||||
PCRE2_UCP is set, Unicode properties are used for all characters with more than
|
||||
one other case, and for all characters whose code points are greater than
|
||||
U+007F. Note that there are two ASCII characters, K and S, that, in addition to
|
||||
their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin
|
||||
sign) and U+017F (long S) respectively. For lower valued characters with only
|
||||
one other case, a lookup table is used for speed. When neither PCRE2_UTF nor
|
||||
PCRE2_UCP is set, a lookup table is used for all code points less than 256, and
|
||||
higher code points (available only in 16-bit or 32-bit mode) are treated as not
|
||||
having another case.
|
||||
<pre>
|
||||
PCRE2_DOLLAR_ENDONLY
|
||||
</pre>
|
||||
|
@ -1804,7 +1845,7 @@ undefined. It may cause your program to crash or loop.
|
|||
</P>
|
||||
<P>
|
||||
Note that this option can also be passed to <b>pcre2_match()</b> and
|
||||
<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -1820,16 +1861,23 @@ are not representable in UTF-16.
|
|||
<pre>
|
||||
PCRE2_UCP
|
||||
</pre>
|
||||
This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
|
||||
\w, and some of the POSIX character classes. By default, only ASCII characters
|
||||
are recognized, but if PCRE2_UCP is set, Unicode properties are used instead to
|
||||
classify characters. More details are given in the section on
|
||||
This option has two effects. Firstly, it changes the way PCRE2 processes \B,
|
||||
\b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes. By
|
||||
default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode
|
||||
properties are used instead to classify characters. More details are given in
|
||||
the section on
|
||||
<a href="pcre2pattern.html#genericchartypes">generic character types</a>
|
||||
in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
page. If you set PCRE2_UCP, matching one of the items it affects takes much
|
||||
longer. The option is available only if PCRE2 has been compiled with Unicode
|
||||
support (which is the default).
|
||||
longer.
|
||||
</P>
|
||||
<P>
|
||||
The second effect of PCRE2_UCP is to force the use of Unicode properties for
|
||||
upper/lower casing operations on characters with code points greater than 127,
|
||||
even when PCRE2_UTF is not set. This makes it possible, for example, to process
|
||||
strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has
|
||||
been compiled with Unicode support (which is the default).
|
||||
<pre>
|
||||
PCRE2_UNGREEDY
|
||||
</pre>
|
||||
|
@ -1866,6 +1914,13 @@ Extra compile options
|
|||
<P>
|
||||
The option bits that can be set in a compile context by calling the
|
||||
<b>pcre2_set_compile_extra_options()</b> function are as follows:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
</pre>
|
||||
Since release 10.38 PCRE2 has forbidden the use of \K within lookaround
|
||||
assertions, following Perl's lead. This option is provided to re-enable the
|
||||
previous behaviour (act in positive lookarounds, ignore in negative ones) in
|
||||
case anybody is relying on it.
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
</pre>
|
||||
|
@ -1997,14 +2052,20 @@ PCRE2 handles caseless matching, and determines whether characters are letters,
|
|||
digits, or whatever, by reference to a set of tables, indexed by character code
|
||||
point. However, this applies only to characters whose code points are less than
|
||||
256. By default, higher-valued code points never match escapes such as \w or
|
||||
\d. When PCRE2 is built with Unicode support (the default), all characters can
|
||||
be tested with \p and \P, or, alternatively, the PCRE2_UCP option can be set
|
||||
when a pattern is compiled; this causes \w and friends to use Unicode property
|
||||
support instead of the built-in tables.
|
||||
\d.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \p and \P, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
points greater than 127 to use Unicode properties. These effects apply even
|
||||
when PCRE2_UTF is not set.
|
||||
</P>
|
||||
<P>
|
||||
The use of locales with Unicode is discouraged. If you are handling characters
|
||||
with code points greater than 128, you should either use Unicode support, or
|
||||
with code points greater than 127, you should either use Unicode support, or
|
||||
use locales, but not try to mix the two.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -2030,7 +2091,7 @@ calling <b>pcre2_set_character_tables()</b> to set the tables pointer therein.
|
|||
</P>
|
||||
<P>
|
||||
For example, to build and use tables that are appropriate for the French locale
|
||||
(where accented characters with values greater than 128 are treated as
|
||||
(where accented characters with values greater than 127 are treated as
|
||||
letters), the following code could be used:
|
||||
<pre>
|
||||
setlocale(LC_CTYPE, "fr_FR");
|
||||
|
@ -2044,10 +2105,10 @@ are using Windows, the name for the French locale is "french".
|
|||
</P>
|
||||
<P>
|
||||
The pointer that is passed (via the compile context) to <b>pcre2_compile()</b>
|
||||
is saved with the compiled pattern, and the same tables are used by
|
||||
<b>pcre2_match()</b> and <b>pcre_dfa_match()</b>. Thus, for any single pattern,
|
||||
compilation and matching both happen in the same locale, but different patterns
|
||||
can be processed in different locales.
|
||||
is saved with the compiled pattern, and the same tables are used by the
|
||||
matching functions. Thus, for any single pattern, compilation and matching both
|
||||
happen in the same locale, but different patterns can be processed in different
|
||||
locales.
|
||||
</P>
|
||||
<P>
|
||||
It is the caller's responsibility to ensure that the memory containing the
|
||||
|
@ -2055,6 +2116,23 @@ tables remains available while they are still in use. When they are no longer
|
|||
needed, you can discard them using <b>pcre2_maketables_free()</b>, which should
|
||||
pass as its first parameter the same global context that was used to create the
|
||||
tables.
|
||||
</P>
|
||||
<br><b>
|
||||
Saving locale tables
|
||||
</b><br>
|
||||
<P>
|
||||
The tables described above are just a sequence of binary bytes, which makes
|
||||
them independent of hardware characteristics such as endianness or whether the
|
||||
processor is 32-bit or 64-bit. A copy of the result of <b>pcre2_maketables()</b>
|
||||
can therefore be saved in a file or elsewhere and re-used later, even in a
|
||||
different program or on another computer. The size of the tables (number of
|
||||
bytes) must be obtained by calling <b>pcre2_config()</b> with the
|
||||
PCRE2_CONFIG_TABLES_LENGTH option because <b>pcre2_maketables()</b> does not
|
||||
return this value. Note that the <b>pcre2_dftables</b> program, which is part of
|
||||
the PCRE2 build system, can be used stand-alone to create a file that contains
|
||||
a set of binary tables. See the
|
||||
<a href="pcre2build.html#createtables"><b>pcre2build</b></a>
|
||||
documentation for details.
|
||||
<a name="infoaboutpattern"></a></P>
|
||||
<br><a name="SEC23" href="#TOC1">INFORMATION ABOUT A COMPILED PATTERN</a><br>
|
||||
<P>
|
||||
|
@ -2063,7 +2141,7 @@ tables.
|
|||
<P>
|
||||
The <b>pcre2_pattern_info()</b> function returns general information about a
|
||||
compiled pattern. For information about callouts, see the
|
||||
<a href="pcre2pattern.html#infoaboutcallouts">next section.</a>
|
||||
<a href="#infoaboutcallouts">next section.</a>
|
||||
The first argument for <b>pcre2_pattern_info()</b> is a pointer to the compiled
|
||||
pattern. The second argument specifies which piece of information is required,
|
||||
and the third argument is a pointer to a variable to receive the data. If the
|
||||
|
@ -2238,7 +2316,7 @@ return zero. The third argument should point to a <b>size_t</b> variable.
|
|||
PCRE2_INFO_LASTCODETYPE
|
||||
</pre>
|
||||
Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
|
@ -2441,20 +2519,31 @@ to an abstract format like Java or .NET serialization.
|
|||
Information about a successful or unsuccessful match is placed in a match
|
||||
data block, which is an opaque structure that is accessed by function calls. In
|
||||
particular, the match data block contains a vector of offsets into the subject
|
||||
string that define the matched part of the subject and any substrings that were
|
||||
captured. This is known as the <i>ovector</i>.
|
||||
string that define the matched parts of the subject. This is known as the
|
||||
<i>ovector</i>.
|
||||
</P>
|
||||
<P>
|
||||
Before calling <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b>, or
|
||||
<b>pcre2_jit_match()</b> you must create a match data block by calling one of
|
||||
the creation functions above. For <b>pcre2_match_data_create()</b>, the first
|
||||
argument is the number of pairs of offsets in the <i>ovector</i>. One pair of
|
||||
offsets is required to identify the string that matched the whole pattern, with
|
||||
an additional pair for each captured substring. For example, a value of 4
|
||||
creates enough space to record the matched portion of the subject plus three
|
||||
captured substrings. A minimum of at least 1 pair is imposed by
|
||||
<b>pcre2_match_data_create()</b>, so it is always possible to return the overall
|
||||
matched string.
|
||||
argument is the number of pairs of offsets in the <i>ovector</i>.
|
||||
</P>
|
||||
<P>
|
||||
When using <b>pcre2_match()</b>, one pair of offsets is required to identify the
|
||||
string that matched the whole pattern, with an additional pair for each
|
||||
captured substring. For example, a value of 4 creates enough space to record
|
||||
the matched portion of the subject plus three captured substrings.
|
||||
</P>
|
||||
<P>
|
||||
When using <b>pcre2_dfa_match()</b> there may be multiple matched substrings of
|
||||
different lengths at the same point in the subject. The ovector should be made
|
||||
large enough to hold as many as are expected.
|
||||
</P>
|
||||
<P>
|
||||
A minimum of 1 pair is imposed by <b>pcre2_match_data_create()</b>, so
|
||||
it is always possible to return the overall matched string in the case of
|
||||
<b>pcre2_match()</b> or the longest match in the case of
|
||||
<b>pcre2_dfa_match()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The second argument of <b>pcre2_match_data_create()</b> is a pointer to a
|
||||
|
@ -2465,10 +2554,11 @@ pass NULL, which causes <b>malloc()</b> to be used.
|
|||
<P>
|
||||
For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
|
||||
pointer to a compiled pattern. The ovector is created to be exactly the right
|
||||
size to hold all the substrings a pattern might capture. The second argument is
|
||||
again a pointer to a general context, but in this case if NULL is passed, the
|
||||
memory is obtained using the same allocator that was used for the compiled
|
||||
pattern (custom or default).
|
||||
size to hold all the substrings a pattern might capture when matched using
|
||||
<b>pcre2_match()</b>. You should not use this call when matching with
|
||||
<b>pcre2_dfa_match()</b>. The second argument is again a pointer to a general
|
||||
context, but in this case if NULL is passed, the memory is obtained using the
|
||||
same allocator that was used for the compiled pattern (custom or default).
|
||||
</P>
|
||||
<P>
|
||||
A match data block can be used many times, with the same or different compiled
|
||||
|
@ -2550,7 +2640,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
|
|||
<i>startoffset</i>. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
|
||||
<i>length</i> is zero, the subject is assumed to be an empty string. If
|
||||
<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
If <i>startoffset</i> is greater than the length of the subject,
|
||||
|
@ -2572,10 +2664,10 @@ lookbehind. For example, consider the pattern
|
|||
</pre>
|
||||
which finds occurrences of "iss" in the middle of words. (\B matches only if
|
||||
the current position in the subject is not a word boundary.) When applied to
|
||||
the string "Mississipi" the first call to <b>pcre2_match()</b> finds the first
|
||||
the string "Mississippi" the first call to <b>pcre2_match()</b> finds the first
|
||||
occurrence. If <b>pcre2_match()</b> is called again with just the remainder of
|
||||
the subject, namely "issipi", it does not match, because \B is always false at
|
||||
the start of the subject, which is deemed to be a word boundary. However, if
|
||||
the subject, namely "issippi", it does not match, because \B is always false
|
||||
at the start of the subject, which is deemed to be a word boundary. However, if
|
||||
<b>pcre2_match()</b> is passed the entire string again, but with
|
||||
<i>startoffset</i> set to 4, it finds the second occurrence of "iss" because it
|
||||
is able to look behind the starting point to discover that it is preceded by a
|
||||
|
@ -3054,11 +3146,11 @@ The backtracking match limit was reached.
|
|||
<pre>
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
</pre>
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL
|
||||
</pre>
|
||||
|
@ -3302,12 +3394,25 @@ same number causes an error at compile time.
|
|||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||
string in <i>outputbuffer</i>, replacing one or more parts that were matched
|
||||
with the <i>replacement</i> string, whose length is supplied in <b>rlength</b>.
|
||||
This can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
The default is to perform just one replacement, but there is an option that
|
||||
requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below for details).
|
||||
This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
|
||||
subject string in <i>outputbuffer</i>, replacing parts that were matched with
|
||||
the <i>replacement</i> string, whose length is supplied in <i>rlength</i>, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
|
||||
replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
|
||||
error occurs if <i>replacement</i> is NULL.
|
||||
</P>
|
||||
<P>
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
</P>
|
||||
<P>
|
||||
If successful, <b>pcre2_substitute()</b> returns the number of substitutions
|
||||
that were carried out. This may be zero if no match was found, and is never
|
||||
greater than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A negative value is
|
||||
returned if an error is detected.
|
||||
</P>
|
||||
<P>
|
||||
Matches in which a \K item in a lookahead in the pattern causes the match to
|
||||
|
@ -3325,35 +3430,86 @@ functions from the match context, if provided, or else those that were used to
|
|||
allocate memory for the compiled code.
|
||||
</P>
|
||||
<P>
|
||||
If an external <i>match_data</i> block is provided, its contents afterwards
|
||||
are those set by the final call to <b>pcre2_match()</b>. For global changes,
|
||||
this will have ended in a matching error. The contents of the ovector within
|
||||
the match data block may or may not have been changed.
|
||||
If <i>match_data</i> is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the
|
||||
provided block is used for all calls to <b>pcre2_match()</b>, and its contents
|
||||
afterwards are the result of the final call. For global changes, this will
|
||||
always be a no-match error. The contents of the ovector within the match data
|
||||
block may or may not have been changed.
|
||||
</P>
|
||||
<P>
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful, the
|
||||
value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added.
|
||||
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
<i>match_data</i> block must be provided, and it must have already been used for
|
||||
an external call to <b>pcre2_match()</b> with the same pattern and subject
|
||||
arguments. The data in the <i>match_data</i> block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling <b>pcre2_match()</b>
|
||||
from within <b>pcre2_substitute()</b>. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
</P>
|
||||
<P>
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
|
||||
<b>pcre2_match()</b> is called after the first substitution to check for further
|
||||
matches, but this is done using an internally obtained match data block, thus
|
||||
always leaving the external block unchanged.
|
||||
</P>
|
||||
<P>
|
||||
The <i>code</i> argument is not used for matching before the first substitution
|
||||
when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, even when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains information such as the
|
||||
UTF setting and the number of capturing parentheses in the pattern.
|
||||
</P>
|
||||
<P>
|
||||
The default action of <b>pcre2_substitute()</b> is to return a copy of the
|
||||
subject string with matched substrings replaced. However, if
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are
|
||||
returned. In the global case, multiple replacements are concatenated in the
|
||||
output buffer. Substitution callouts (see
|
||||
<a href="#subcallouts">below)</a>
|
||||
can be used to separate them if necessary.
|
||||
</P>
|
||||
<P>
|
||||
The <i>outlengthptr</i> argument of <b>pcre2_substitute()</b> must point to a
|
||||
variable that contains the length, in code units, of the output buffer. If the
|
||||
function is successful, the value is updated to contain the length in code
|
||||
units of the new string, excluding the trailing zero that is automatically
|
||||
added.
|
||||
</P>
|
||||
<P>
|
||||
If the function is not successful, the value set via <i>outlengthptr</i> depends
|
||||
on the type of error. For syntax errors in the replacement string, the value is
|
||||
the offset in the replacement string where the error was detected. For other
|
||||
errors, the value is PCRE2_UNSET by default. This includes the case of the
|
||||
output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set
|
||||
(see below), in which case the value is the minimum length needed, including
|
||||
space for the trailing zero. Note that in order to compute the required length,
|
||||
<b>pcre2_substitute()</b> has to simulate all the matching and copying, instead
|
||||
of giving an error return as soon as the buffer overflows. Note also that the
|
||||
length is in code units, not bytes.
|
||||
output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set.
|
||||
</P>
|
||||
<P>
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capture groups or names from (*MARK) or other control verbs
|
||||
in the pattern. The following forms are always recognized:
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||
this option is set, however, <b>pcre2_substitute()</b> continues to go through
|
||||
the motions of matching and substituting (without, of course, writing anything)
|
||||
in order to compute the size of buffer that is needed. This value is passed
|
||||
back via the <i>outlengthptr</i> variable, with the result of the function still
|
||||
being PCRE2_ERROR_NOMEMORY.
|
||||
</P>
|
||||
<P>
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for a given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||
</P>
|
||||
<P>
|
||||
The replacement string, which is interpreted as a UTF string in UTF mode, is
|
||||
checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF
|
||||
replacement string causes an immediate return with the relevant UTF error code.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted
|
||||
in any way. By default, however, a dollar character is an escape character that
|
||||
can specify the insertion of characters from capture groups and names from
|
||||
(*MARK) or other control verbs in the pattern. The following forms are always
|
||||
recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
|
@ -3377,10 +3533,6 @@ facility can be used to perform simple simultaneous substitutions, as this
|
|||
apple lemon
|
||||
2: pear orange
|
||||
</pre>
|
||||
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string,
|
||||
replacing every matching substring. If this option is not set, only the first
|
||||
matching substring is replaced. The search for matches takes place in the
|
||||
|
@ -3392,7 +3544,7 @@ set in the match context, searching stops when that limit is reached.
|
|||
<P>
|
||||
You can restrict the effect of a global substitution to a portion of the
|
||||
subject string by setting either or both of <i>startoffset</i> and an offset
|
||||
limit. Here is a \fPpcre2test\fP example:
|
||||
limit. Here is a <b>pcre2test</b> example:
|
||||
<pre>
|
||||
/B/g,replace=!,use_offset_limit
|
||||
ABC ABC ABC ABC\=offset=3,offset_limit=12
|
||||
|
@ -3405,22 +3557,6 @@ CRLF is a valid newline sequence and the next two characters are CR, LF. In
|
|||
this case, the offset is advanced by two characters.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||
this option is set, however, <b>pcre2_substitute()</b> continues to go through
|
||||
the motions of matching and substituting (without, of course, writing anything)
|
||||
in order to compute the size of buffer that is needed. This value is passed
|
||||
back via the <i>outlengthptr</i> variable, with the result of the function still
|
||||
being PCRE2_ERROR_NOMEMORY.
|
||||
</P>
|
||||
<P>
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do
|
||||
not appear in the pattern to be treated as unset groups. This option should be
|
||||
used with care, because it means that a typo in a group name or number no
|
||||
|
@ -3454,8 +3590,11 @@ and force lower case. The escape sequences change the current state: \U and
|
|||
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
||||
\u and \l force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \Q...\E quoted sequences.
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \Q...\E quoted sequences. If either
|
||||
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
||||
properties are used for case forcing characters whose code points are greater
|
||||
than 127.
|
||||
</P>
|
||||
<P>
|
||||
Note that case forcing sequences such as \U...\E do not nest. For example,
|
||||
|
@ -3494,14 +3633,17 @@ substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
|||
groups in the extended syntax forms to be treated as unset.
|
||||
</P>
|
||||
<P>
|
||||
If successful, <b>pcre2_substitute()</b> returns the number of successful
|
||||
matches. This may be zero if no matches were found, and is never greater than 1
|
||||
unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET,
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrelevant and
|
||||
are ignored.
|
||||
</P>
|
||||
<br><b>
|
||||
Substitution errors
|
||||
</b><br>
|
||||
<P>
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
||||
are passed straight back.
|
||||
In the event of an error, <b>pcre2_substitute()</b> returns a negative error
|
||||
code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from
|
||||
<b>pcre2_match()</b> are passed straight back.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
||||
|
@ -3519,6 +3661,12 @@ needed is returned via <i>outlengthptr</i>. Note that this does not happen by
|
|||
default.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||
(invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket
|
||||
|
@ -3532,7 +3680,7 @@ As for all PCRE2 errors, a text message that describes the error can be
|
|||
obtained by calling the <b>pcre2_get_error_message()</b> function (see
|
||||
"Obtaining a textual error message"
|
||||
<a href="#geterrormessage">above).</a>
|
||||
</P>
|
||||
<a name="subcallouts"></a></P>
|
||||
<br><b>
|
||||
Substitution callouts
|
||||
</b><br>
|
||||
|
@ -3671,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
<P>
|
||||
The function <b>pcre2_dfa_match()</b> is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
<b>pcre2_dfa_match()</b> does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
|
||||
not support, see the
|
||||
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -3711,7 +3860,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
|
|||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
Option bits for <b>pcre_dfa_match()</b>
|
||||
Option bits for <b>pcre2_dfa_match()</b>
|
||||
</b><br>
|
||||
<P>
|
||||
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
||||
|
@ -3862,16 +4011,16 @@ fail, this error is given.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 02 September 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -128,7 +128,7 @@ To build it without Unicode support, add
|
|||
--disable-unicode
|
||||
</pre>
|
||||
to the <b>configure</b> command. This setting applies to all three libraries. It
|
||||
is not possible to build one library with Unicode support, and another without,
|
||||
is not possible to build one library with Unicode support and another without
|
||||
in the same configuration.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \P, \p,
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
|
||||
supported. Details are given in the
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -188,11 +189,11 @@ which enables the use of an execmem allocator in JIT that is compatible with
|
|||
SELinux. This has no effect if JIT is not enabled. See the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
documentation for a discussion of JIT usage. When JIT support is enabled,
|
||||
pcre2grep automatically makes use of it, unless you add
|
||||
<b>pcre2grep</b> automatically makes use of it, unless you add
|
||||
<pre>
|
||||
--disable-pcre2grep-jit
|
||||
</pre>
|
||||
to the "configure" command.
|
||||
to the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||
<P>
|
||||
|
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
<pre>
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
</pre>
|
||||
to the <b>configure</b> command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -321,7 +321,7 @@ As well as applying to <b>pcre2_match()</b>, the depth limit also controls
|
|||
the depth of recursive function calls in <b>pcre2_dfa_match()</b>. These are
|
||||
used for lookaround assertions, atomic groups, and recursion within patterns.
|
||||
The limit does not apply to JIT matching.
|
||||
</P>
|
||||
<a name="createtables"></a></P>
|
||||
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<P>
|
||||
PCRE2 uses fixed tables for processing characters whose code points are less
|
||||
|
@ -332,12 +332,34 @@ only. If you add
|
|||
--enable-rebuild-chartables
|
||||
</pre>
|
||||
to the <b>configure</b> command, the distributed tables are no longer used.
|
||||
Instead, a program called <b>dftables</b> is compiled and run. This outputs the
|
||||
source for new set of tables, created in the default locale of your C run-time
|
||||
system. This method of replacing the tables does not work if you are cross
|
||||
compiling, because <b>dftables</b> is run on the local host. If you need to
|
||||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".
|
||||
Instead, a program called <b>pcre2_dftables</b> is compiled and run. This
|
||||
outputs the source for a new set of tables, created in the default locale of your
|
||||
C run-time system. This method of replacing the tables does not work if you are
|
||||
cross compiling, because <b>pcre2_dftables</b> needs to be run on the local
|
||||
host and therefore not compiled with the cross compiler.
|
||||
</P>
|
||||
<P>
|
||||
If you need to create alternative tables when cross compiling, you will have to
|
||||
do so "by hand". There may also be other reasons for creating tables manually.
|
||||
To cause <b>pcre2_dftables</b> to be built on the local host, run a normal
|
||||
compiling command, and then run the program with the output file as its
|
||||
argument, for example:
|
||||
<pre>
|
||||
cc src/pcre2_dftables.c -o pcre2_dftables
|
||||
./pcre2_dftables src/pcre2_chartables.c
|
||||
</pre>
|
||||
This builds the tables in the default locale of the local host. If you want to
|
||||
specify a locale, you must use the -L option:
|
||||
<pre>
|
||||
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
||||
</pre>
|
||||
You can also specify -b (with or without -L). This causes the tables to be
|
||||
written in binary instead of as source code. A set of binary tables can be
|
||||
loaded into memory by an application and passed to <b>pcre2_compile()</b> in the
|
||||
same way as tables created by calling <b>pcre2_maketables()</b>. The tables are
|
||||
just a string of bytes, independent of hardware characteristics such as
|
||||
endianness. This means they can be bundled with an application that runs in
|
||||
different environments, to ensure consistent behaviour.
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<P>
|
||||
|
@ -414,7 +436,7 @@ default parameter values by adding, for example,
|
|||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
</pre>
|
||||
to the <b>configure</b> command. The caller of \fPpcre2grep\fP can override
|
||||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override
|
||||
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
|
@ -531,15 +553,16 @@ documentation.
|
|||
<P>
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
<pre>
|
||||
--disable-percent-zt
|
||||
</pre>
|
||||
is specified, no use is made of the z or t modifiers. Instead or %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
a suitable format is used depending on the size of long for the platform.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
|
||||
<P>
|
||||
|
@ -585,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 March 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -16,32 +16,43 @@ please consult the man page, in case the conversion went wrong.
|
|||
DIFFERENCES BETWEEN PCRE2 AND PERL
|
||||
</b><br>
|
||||
<P>
|
||||
This document describes the differences in the ways that PCRE2 and Perl handle
|
||||
regular expressions. The differences described here are with respect to Perl
|
||||
versions 5.26, but as both Perl and PCRE2 are continually changing, the
|
||||
information may sometimes be out of date.
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
</P>
|
||||
<P>
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
</P>
|
||||
<P>
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \b* (but not \b{3}), but these do not seem to have any use.
|
||||
for example, \b*, but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
</P>
|
||||
<P>
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
</P>
|
||||
<P>
|
||||
4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
\U, and \N when followed by a character name. \N on its own, matching a
|
||||
non-newline character, and \N{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -52,24 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
|
|||
interprets them.
|
||||
</P>
|
||||
<P>
|
||||
5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \p and \P are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
PCRE2 does support the Cs (surrogate) property, which Perl does not; the Perl
|
||||
documentation says "Because Perl hides the need for the user to understand the
|
||||
internal representation of Unicode characters, there is no need to implement
|
||||
the somewhat messy concept of surrogates."
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
</P>
|
||||
<P>
|
||||
6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \Q and \E which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \Q and \E just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \Q
|
||||
and \E which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \Q and \E just like any other character. Note the
|
||||
following examples:
|
||||
<pre>
|
||||
Pattern PCRE2 matches Perl matches
|
||||
|
||||
|
@ -79,41 +92,38 @@ other character. Note the following examples:
|
|||
\QA\B\E A\B A\B
|
||||
\Q\\E \ \\E
|
||||
</pre>
|
||||
The \Q...\E sequence is recognized both inside and outside character classes.
|
||||
The \Q...\E sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
</P>
|
||||
<P>
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
</P>
|
||||
<P>
|
||||
9. If any of the backtracking control verbs are used in a group that is called
|
||||
as a subroutine (whether or not recursively), their effect is confined to that
|
||||
group; it does not extend to the surrounding pattern. This is not always the
|
||||
case in Perl. In particular, if (*THEN) is present in a group that is called as
|
||||
a subroutine, its action is limited to that group, even if the group does not
|
||||
contain any | characters. Note that such groups are processed as anchored
|
||||
at the point where they are tested.
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
that is called as a subroutine, its action is limited to that group, even if
|
||||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
</P>
|
||||
<P>
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
</P>
|
||||
<P>
|
||||
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||
not confined to the assertion.
|
||||
</P>
|
||||
<P>
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
|
@ -123,7 +133,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
|||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact that PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B), where the two
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
capture groups have the same number but different names, is not supported, and
|
||||
causes an error at compile time. If it were allowed, it would not be possible
|
||||
to distinguish which group matched, because both names map to capture group
|
||||
|
@ -146,19 +156,27 @@ certainly user mistakes.
|
|||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \p{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.24), \p{Lu} and \p{Ll} match all
|
||||
in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
</P>
|
||||
<P>
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 includes new features that are not in earlier versions of Perl, some
|
||||
17. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\K is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
</P>
|
||||
<P>
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.26:
|
||||
list is with respect to Perl 5.34:
|
||||
<br>
|
||||
<br>
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative branch of a lookbehind assertion can match a different length
|
||||
of string. Perl requires them all to have the same length.
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
<br>
|
||||
<br>
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
|
@ -203,7 +221,7 @@ different way and is not Perl-compatible.
|
|||
<br>
|
||||
<br>
|
||||
(l) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
|
||||
the start of a pattern that set overall options that cannot be changed within
|
||||
the start of a pattern. These set overall options that cannot be changed within
|
||||
the pattern.
|
||||
<br>
|
||||
<br>
|
||||
|
@ -212,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible
|
|||
lookarounds are atomic.
|
||||
</P>
|
||||
<P>
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
</P>
|
||||
<P>
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
<a href="pcre2limit.html"><b>pcre2limit</b></a>
|
||||
documentation for details. In release 5.10, Perl switched from recursion to iteration,
|
||||
keeping the intermediate matches on the heap, which is ~10% slower but does not
|
||||
|
@ -230,7 +248,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -239,9 +257,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 13 July 2019
|
||||
Last updated: 08 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -141,8 +141,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">CONVERTING POSIX PATTERNS</a><br>
|
||||
<P>
|
||||
|
|
|
@ -215,8 +215,8 @@ if (rc < 0)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded. Get a pointer to the output vector, where string offsets are
|
||||
stored. */
|
||||
/* Match succeeded. Get a pointer to the output vector, where string offsets
|
||||
are stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
@ -234,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
|
|||
if (rc == 0)
|
||||
printf("ovector was not big enough for all the captured substrings\n");
|
||||
|
||||
/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
|
||||
to set the start of a match later than its end. In this demonstration program,
|
||||
we just detect this case and give up. */
|
||||
/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
|
||||
assertions. However, there is an option to re-enable the old behaviour. If that
|
||||
is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
|
||||
assertion to set the start of a match later than its end. In this demonstration
|
||||
program, we show how to detect this case, but it shouldn't arise because the
|
||||
option is never set. */
|
||||
|
||||
if (ovector[0] > ovector[1])
|
||||
{
|
||||
|
@ -453,7 +456,7 @@ for (;;)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
/* Match succeeded */
|
||||
|
||||
printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -111,8 +113,8 @@ matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
|
|||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be found. If
|
||||
there are multiple patterns, they are all tried on the remainder of the line,
|
||||
but patterns that follow the one that matched are not tried on the earlier part
|
||||
of the line.
|
||||
but patterns that follow the one that matched are not tried on the earlier
|
||||
matched part of the line.
|
||||
</P>
|
||||
<P>
|
||||
This behaviour means that the order in which multiple patterns are specified
|
||||
|
@ -146,11 +148,10 @@ ignored.
|
|||
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
|
||||
<P>
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. (GNU grep
|
||||
identifies binary files in this manner.) However, if the newline type is
|
||||
specified as "nul", that is, the line terminator is a binary zero, the test for
|
||||
a binary file is not applied. See the <b>--binary-files</b> option for a means
|
||||
of changing the way binary files are handled.
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the <b>--binary-files</b>
|
||||
option for a means of changing the way binary files are handled.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
|
||||
<P>
|
||||
|
@ -179,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -189,14 +192,21 @@ Treat binary files as text. This is equivalent to
|
|||
<b>--binary-files</b>=<i>text</i>.
|
||||
</P>
|
||||
<P>
|
||||
<b>--allow-lookaround-bsk</b>
|
||||
PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl.
|
||||
This option causes <b>pcre2grep</b> to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
</P>
|
||||
<P>
|
||||
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
||||
Output up to <i>number</i> lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -406,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -443,8 +455,8 @@ Ignore upper/lower case distinctions during comparisons.
|
|||
<P>
|
||||
<b>--include</b>=<i>pattern</i>
|
||||
If any <b>--include</b> patterns are specified, the only files that are
|
||||
processed are those that match one of the patterns (and do not match an
|
||||
<b>--exclude</b> pattern). This option does not affect directories, but it
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
<b>--exclude</b> pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
|
@ -463,8 +475,8 @@ may be given any number of times; all the files are read.
|
|||
<P>
|
||||
<b>--include-dir</b>=<i>pattern</i>
|
||||
If any <b>--include-dir</b> patterns are specified, the only directories that
|
||||
are processed are those that match one of the patterns (and do not match an
|
||||
<b>--exclude-dir</b> pattern). This applies to all directories, whether listed
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an <b>--exclude-dir</b> pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from <b>--file-list</b>, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The <b>-F</b>,
|
||||
|
@ -476,19 +488,22 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches. This
|
||||
option overrides any previous <b>-H</b>, <b>-h</b>, or <b>-L</b> options.
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--label</b>=<i>name</i>
|
||||
|
@ -501,8 +516,8 @@ short form for this option.
|
|||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
|
||||
terminal (which is currently possible only in Unix-like environments or
|
||||
Windows). Output to terminal is normally automatically flushed by the operating
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to a terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the <b>-M</b> (multiline) option
|
||||
|
@ -528,46 +543,6 @@ locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
|||
used. There is no short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>--match-limit</b>=<i>number</i>
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--match-limit</b> option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by <b>--match-limit</b> is reached, an error occurs.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than <b>--match-limit</b>.
|
||||
<br>
|
||||
<br>
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
</P>
|
||||
<P>
|
||||
\fB--max-buffer-size=<i>number</i>
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
</P>
|
||||
<P>
|
||||
<b>-M</b>, <b>--multiline</b>
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
|
@ -597,29 +572,84 @@ well as possibly handling a two-character newline sequence.
|
|||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||
does not work when input is read line by line (see \fP--line-buffered\fP.)
|
||||
does not work when input is read line by line (see <b>--line-buffered</b>.)
|
||||
</P>
|
||||
<P>
|
||||
<b>-m</b> <i>number</i>, <b>--max-count</b>=<i>number</i>
|
||||
Stop processing after finding <i>number</i> matching lines, or non-matching
|
||||
lines if <b>-v</b> is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If <b>-c</b> is also set, the count that is output is never greater than
|
||||
<i>number</i>. This option has no effect if used with <b>-L</b>, <b>-l</b>, or
|
||||
<b>-q</b>, or when just checking for a match in a binary file.
|
||||
</P>
|
||||
<P>
|
||||
<b>--match-limit</b>=<i>number</i>
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--match-limit</b> option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by <b>--match-limit</b> is reached, an error occurs.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than <b>--match-limit</b>.
|
||||
<br>
|
||||
<br>
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
</P>
|
||||
<P>
|
||||
<b>--max-buffer-size</b>=<i>number</i>
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
</P>
|
||||
<P>
|
||||
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
|
||||
The PCRE2 library supports five different conventions for indicating
|
||||
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||
which recognizes any of the preceding three types, and an "any" convention, in
|
||||
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||
(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||
PS (paragraph separator, U+2029).
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
<pre>
|
||||
pcre2grep -N CRLF 'some pattern' <file>
|
||||
</pre>
|
||||
The newline type may be specified in upper, lower, or mixed case. If the
|
||||
newline type is NUL, lines are separated by binary zero characters. The other
|
||||
types are the single-character sequences CR (carriage return) and LF
|
||||
(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
|
||||
recognizes any of the preceding three types, and an "any" type, for which any
|
||||
Unicode line ending sequence is assumed to end a line. The Unicode sequences
|
||||
are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
<br>
|
||||
<br>
|
||||
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, <b>pcre2grep</b> uses the library's default.
|
||||
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||
makes it possible to use <b>pcre2grep</b> to scan files that have come from
|
||||
other environments without having to modify their line endings. If the data
|
||||
that is being scanned does not agree with the convention set by this option,
|
||||
<b>pcre2grep</b> may behave in strange ways. Note that this option does not
|
||||
apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
|
||||
<br>
|
||||
<br>
|
||||
This option makes it possible to use <b>pcre2grep</b> to scan files that have
|
||||
come from other environments without having to modify their line endings. If
|
||||
the data that is being scanned does not agree with the convention set by this
|
||||
option, <b>pcre2grep</b> may behave in strange ways. Note that this option does
|
||||
not apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
|
||||
<b>--include-from</b> options, which are expected to use the operating system's
|
||||
standard newline sequence.
|
||||
</P>
|
||||
|
@ -641,29 +671,41 @@ It should never be needed in normal use.
|
|||
</P>
|
||||
<P>
|
||||
<b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
|
||||
When there is a match, instead of outputting the whole line that matched,
|
||||
output just the given text. This option is mutually exclusive with
|
||||
<b>--only-matching</b>, <b>--file-offsets</b>, and <b>--line-offsets</b>. Escape
|
||||
sequences starting with a dollar character may be used to insert the contents
|
||||
of the matched part of the line and/or captured substrings into the text.
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
|
||||
and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
|
||||
this option, which is mutually exclusive with <b>--only-matching</b>,
|
||||
<b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
|
||||
<b>--only-matching</b>, if there is more than one match in a line, each of them
|
||||
causes a line of output.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured
|
||||
substring of the given decimal number; zero substitutes the whole match. If
|
||||
the number is greater than the number of capturing substrings, or if the
|
||||
capture is unset, the replacement is empty.
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
<br>
|
||||
<br>
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
<br>
|
||||
<br>
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
<br>
|
||||
<br>
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
<br>
|
||||
<br>
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
|
@ -732,7 +774,8 @@ option to "recurse".
|
|||
</P>
|
||||
<P>
|
||||
<b>--recursion-limit</b>=<i>number</i>
|
||||
See <b>--match-limit</b> above.
|
||||
This is an obsolete synonym for <b>--depth-limit</b>. See <b>--match-limit</b>
|
||||
above for details.
|
||||
</P>
|
||||
<P>
|
||||
<b>-s</b>, <b>--no-messages</b>
|
||||
|
@ -756,15 +799,18 @@ total would always be zero.
|
|||
<b>-u</b>, <b>--utf</b>
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||
<b>--include</b> options) and all subject lines that are scanned must be valid
|
||||
strings of UTF-8 characters.
|
||||
<b>--include</b> options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
</P>
|
||||
<P>
|
||||
<b>-U</b>, <b>--utf-allow-invalid</b>
|
||||
As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. This facility
|
||||
allows valid UTF-8 strings to be sought in executable or other binary files.
|
||||
For more details about matching in non-valid UTF-8 strings, see the
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -777,7 +823,9 @@ ignored.
|
|||
<P>
|
||||
<b>-v</b>, <b>--invert-match</b>
|
||||
Invert the sense of the match, so that lines which do <i>not</i> match any of
|
||||
the patterns are the ones that are found.
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as <b>--only-matching</b> and <b>--output</b>, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
|
||||
|
@ -797,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -807,16 +862,27 @@ by the <b>--locale</b> option. If no locale is set, the PCRE2 library's default
|
|||
<br><a name="SEC8" href="#TOC1">NEWLINES</a><br>
|
||||
<P>
|
||||
The <b>-N</b> (<b>--newline</b>) option allows <b>pcre2grep</b> to scan files with
|
||||
different newline conventions from the default. Any parts of the input files
|
||||
that are written to the standard output are copied identically, with whatever
|
||||
newline sequences they have in the input. However, the setting of this option
|
||||
affects only the way scanned files are processed. It does not affect the
|
||||
interpretation of files specified by the <b>-f</b>, <b>--file-list</b>,
|
||||
<b>--exclude-from</b>, or <b>--include-from</b> options, nor does it affect the
|
||||
way in which <b>pcre2grep</b> writes informational messages to the standard
|
||||
error and output streams. For these it uses the string "\n" to indicate
|
||||
newlines, relying on the C I/O library to convert this to an appropriate
|
||||
sequence.
|
||||
newline conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation of files
|
||||
specified by the <b>-f</b>, <b>--file-list</b>, <b>--exclude-from</b>, or
|
||||
<b>--include-from</b> options.
|
||||
</P>
|
||||
<P>
|
||||
Any parts of the scanned input files that are written to the standard output
|
||||
are copied with whatever newline sequences they have in the input. However, if
|
||||
the final line of a file is output, and it does not end with a newline
|
||||
sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
|
||||
or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
|
||||
single NL is used.
|
||||
</P>
|
||||
<P>
|
||||
The newline setting does not affect the way in which <b>pcre2grep</b> writes
|
||||
newlines in informational messages to the standard output and error streams.
|
||||
Under Windows, the standard output is set to be binary, so that "\r\n" at the
|
||||
ends of output lines that are copied from the input is not converted to
|
||||
"\r\r\n" by the C I/O library. This means that any messages written to the
|
||||
standard output must end with "\r\n". For all other operating systems, and
|
||||
for all messages to the standard error stream, "\n" is used.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
|
||||
<P>
|
||||
|
@ -889,12 +955,36 @@ documentation for details). Numbered callouts are ignored by <b>pcre2grep</b>;
|
|||
only callouts with string arguments are useful.
|
||||
</P>
|
||||
<br><b>
|
||||
Echoing a specific string
|
||||
</b><br>
|
||||
<P>
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
<b>pcre2grep</b> was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) cause the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
<pre>
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
</pre>
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
</P>
|
||||
<br><b>
|
||||
Calling external programs or scripts
|
||||
</b><br>
|
||||
<P>
|
||||
This facility can be independently disabled when <b>pcre2grep</b> is built. It
|
||||
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
|
||||
where <b>lib$spawn()</b> is used, and for any other Unix-like environment where
|
||||
where <b>lib$spawn()</b> is used, and for any Unix-like environment where
|
||||
<b>fork()</b> and <b>execv()</b> are available.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -906,14 +996,11 @@ arguments:
|
|||
executable_name|arg1|arg2|...
|
||||
</pre>
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
||||
captured substring of the given decimal number, which must be greater than
|
||||
zero. If the number is greater than the number of capturing substrings, or if
|
||||
the capture is unset, the replacement is empty.
|
||||
</P>
|
||||
<P>
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||
started by a dollar character. These are the same as for the <b>--output</b>
|
||||
(<b>-O</b>) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
<pre>
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -926,28 +1013,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
</pre>
|
||||
The parameters for the system call that is used to run the
|
||||
program or script are zero-terminated strings. This means that binary zero
|
||||
characters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. If running the program fails for any reason (including
|
||||
the non-existence of the executable), a local matching failure occurs and the
|
||||
matcher backtracks in the normal way.
|
||||
</P>
|
||||
<br><b>
|
||||
Echoing a specific string
|
||||
</b><br>
|
||||
<P>
|
||||
This facility is always available, provided that callouts were not completely
|
||||
disabled when <b>pcre2grep</b> was built. If the callout string starts with a
|
||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
||||
having been passed through the same escape processing as text from the --output
|
||||
option. This provides a simple echoing facility that avoids calling an external
|
||||
program or script. No terminator is added to the string, so if you want a
|
||||
newline, you must include it explicitly. Matching continues normally after the
|
||||
string is output. If you want to see only the callout output but not any output
|
||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) cause the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
|
||||
<P>
|
||||
|
@ -979,22 +1052,23 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3).
|
||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2unicode</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 15 June 2019
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -54,6 +54,7 @@ platforms:
|
|||
<pre>
|
||||
ARM 32-bit (v5, v7, and Thumb2)
|
||||
ARM 64-bit
|
||||
IBM s390x 64 bit
|
||||
Intel x86 32-bit and 64-bit
|
||||
MIPS 32-bit and 64-bit
|
||||
Power PC 32-bit and 64-bit
|
||||
|
@ -90,7 +91,7 @@ or a negative error code.
|
|||
There is a limit to the size of pattern that JIT supports, imposed by the size
|
||||
of machine stack that it uses. The exact rules are not documented because they
|
||||
may change at any time, in particular, when new optimizations are introduced.
|
||||
If a pattern is too big, a call to \fBpcre2_jit_compile()\fB returns
|
||||
If a pattern is too big, a call to <b>pcre2_jit_compile()</b> returns
|
||||
PCRE2_ERROR_NOMEMORY.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -268,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
|
|||
for currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
</P>
|
||||
<P>
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
|
@ -286,7 +287,7 @@ inefficient solution, and not recommended.
|
|||
This is a suggestion for how a multithreaded program that needs to set up
|
||||
non-default JIT stacks might operate:
|
||||
<pre>
|
||||
During thread initalization
|
||||
During thread initialization
|
||||
thread_local_var = pcre2_jit_stack_create(...)
|
||||
|
||||
During thread exit
|
||||
|
@ -339,12 +340,12 @@ stack through the JIT callback function.
|
|||
You can free a JIT stack at any time, as long as it will not be used by
|
||||
<b>pcre2_match()</b> again. When you assign the stack to a match context, only a
|
||||
pointer is set. There is no reference counting or any other magic. You can free
|
||||
compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo
|
||||
not\fP call <b>pcre2_match()</b> with a match context pointing to an already
|
||||
freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently
|
||||
used by <b>pcre2_match()</b> in another thread). You can also replace the stack
|
||||
in a context at any time when it is not in use. You should free the previous
|
||||
stack before assigning a replacement.
|
||||
compiled patterns, contexts, and stacks in any order, anytime.
|
||||
Just <i>do not</i> call <b>pcre2_match()</b> with a match context pointing to an
|
||||
already freed stack, as that will cause SEGFAULT. (Also, do not free a stack
|
||||
currently used by <b>pcre2_match()</b> in another thread). You can also replace
|
||||
the stack in a context at any time when it is not in use. You should free the
|
||||
previous stack before assigning a replacement.
|
||||
</P>
|
||||
<P>
|
||||
(5) Should I allocate/free a stack every time before/after calling
|
||||
|
@ -381,8 +382,8 @@ out this complicated API.
|
|||
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -441,10 +442,10 @@ that was not compiled.
|
|||
<P>
|
||||
When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
</P>
|
||||
<P>
|
||||
Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
|
||||
|
@ -465,9 +466,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 30 November 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -78,8 +78,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
|
|||
If a leaf node is reached, a matching string has been found, and at that point
|
||||
the algorithm stops. Thus, if there is more than one possible match, this
|
||||
algorithm returns the first one that it finds. Whether this is the shortest,
|
||||
the longest, or some intermediate length depends on the way the greedy and
|
||||
ungreedy repetition quantifiers are specified in the pattern.
|
||||
the longest, or some intermediate length depends on the way the alternations
|
||||
and the greedy or ungreedy repetition quantifiers are specified in the
|
||||
pattern.
|
||||
</P>
|
||||
<P>
|
||||
Because it ends up with a single path through the tree, it is relatively
|
||||
|
@ -109,11 +110,17 @@ no more unterminated paths. At this point, terminated paths represent the
|
|||
different matching possibilities (if there are none, the match has failed).
|
||||
Thus, if there is more than one possible match, this algorithm finds all of
|
||||
them, and in particular, it finds the longest. The matches are returned in
|
||||
decreasing order of length. There is an option to stop the algorithm after the
|
||||
first match (which is necessarily the shortest) is found.
|
||||
the output vector in decreasing order of length. There is an option to stop the
|
||||
algorithm after the first match (which is necessarily the shortest) is found.
|
||||
</P>
|
||||
<P>
|
||||
Note that all the matches that are found start at the same point in the
|
||||
Note that the size of vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
|
||||
data block is therefore not advisable when doing DFA matching.
|
||||
</P>
|
||||
<P>
|
||||
Note also that all the matches that are found start at the same point in the
|
||||
subject. If the pattern
|
||||
<pre>
|
||||
cat(er(pillar)?)?
|
||||
|
@ -194,21 +201,14 @@ supported by <b>pcre2_dfa_match()</b>.
|
|||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
|
||||
<P>
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
The main advantage of the alternative algorithm is that all possible matches
|
||||
(at a single point in the subject) are automatically found, and in particular,
|
||||
the longest match is found. To find more than one match at the same point using
|
||||
the standard algorithm, you have to do kludgy things with callouts.
|
||||
</P>
|
||||
<P>
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
match using the standard algorithm, you have to do kludgy things with
|
||||
callouts.
|
||||
</P>
|
||||
<P>
|
||||
2. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack (except for lookbehinds), it is possible to pass very
|
||||
long subject strings to the matching function in several pieces, checking for
|
||||
partial matching each time. Although it is also possible to do multi-segment
|
||||
matching using the standard algorithm, by retaining partially matched
|
||||
substrings, it is more complicated. The
|
||||
Partial matching is possible with this algorithm, though it has some
|
||||
limitations. The
|
||||
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
||||
documentation gives details of partial matching and discusses multi-segment
|
||||
matching.
|
||||
|
@ -230,20 +230,23 @@ invalid UTF string are not supported.
|
|||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
</P>
|
||||
<P>
|
||||
4. JIT optimization is not supported.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 28 August 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -295,7 +295,7 @@ these characters with '<' if the <b>allusedtext</b> modifier is set:
|
|||
Partial match: 123ab
|
||||
<<<
|
||||
</pre>
|
||||
However, the \fPallusedtext\fP modifier is not available for JIT matching,
|
||||
However, the <b>allusedtext</b> modifier is not available for JIT matching,
|
||||
because JIT matching does not record the first (or last) consulted characters.
|
||||
For this reason, this information is not available via the API. It is therefore
|
||||
not possible in general to obtain the exact number of characters that must be
|
||||
|
|
|
@ -114,7 +114,8 @@ Another special sequence that may appear at the start of a pattern is (*UCP).
|
|||
This has the same effect as setting the PCRE2_UCP option: it causes sequences
|
||||
such as \d and \w to use Unicode properties to determine character types,
|
||||
instead of recognizing only characters with codes less than 256 via a lookup
|
||||
table.
|
||||
table. It also causes upper/lower casing operations to use Unicode properties
|
||||
for characters with code points greater than 127, even when UTF is not set.
|
||||
</P>
|
||||
<P>
|
||||
Some applications that allow their users to supply patterns may wish to
|
||||
|
@ -288,8 +289,11 @@ corresponding characters in the subject. As a trivial example, the pattern
|
|||
The quick brown fox
|
||||
</pre>
|
||||
matches a portion of a subject string that is identical to itself. When
|
||||
caseless matching is specified (the PCRE2_CASELESS option), letters are matched
|
||||
independently of case.
|
||||
caseless matching is specified (the PCRE2_CASELESS option or (?i) within the
|
||||
pattern), letters are matched independently of case. Note that there are two
|
||||
ASCII characters, K and S, that, in addition to their lower case ASCII
|
||||
equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
|
||||
(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
</P>
|
||||
<P>
|
||||
The power of regular expressions comes from the ability to include wild cards,
|
||||
|
@ -325,6 +329,20 @@ a character class the only metacharacters are:
|
|||
[ POSIX character class (if followed by POSIX syntax)
|
||||
] terminates the character class
|
||||
</pre>
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern, other than in a character class, and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or a # character as
|
||||
part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same
|
||||
applies, but in addition unescaped space and horizontal tab characters are
|
||||
ignored inside a character class. Note: only these two characters are ignored,
|
||||
not the full set of pattern white space characters that are ignored outside a
|
||||
character class. Option settings can be changed within a pattern; see the
|
||||
section entitled
|
||||
<a href="#internaloptions">"Internal Option Setting"</a>
|
||||
below.
|
||||
</P>
|
||||
<P>
|
||||
The following sections describe the use of each of the metacharacters.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">BACKSLASH</a><br>
|
||||
|
@ -342,16 +360,9 @@ precede a non-alphanumeric with backslash to specify that it stands for itself.
|
|||
In particular, if you want to match a backslash, you write \\.
|
||||
</P>
|
||||
<P>
|
||||
In a UTF mode, only ASCII digits and letters have any special meaning after a
|
||||
backslash. All other characters (in particular, those whose code points are
|
||||
greater than 127) are treated as literals.
|
||||
</P>
|
||||
<P>
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern (other than in a character class), and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or # character as part
|
||||
of the pattern.
|
||||
Only ASCII digits and letters have any special meaning after a backslash. All
|
||||
other characters (in particular, those whose code points are greater than 127)
|
||||
are treated as literals.
|
||||
</P>
|
||||
<P>
|
||||
If you want to treat all characters in a sequence as literals, you can do so by
|
||||
|
@ -523,7 +534,7 @@ for themselves. For example, outside a character class:
|
|||
\0113 is a tab followed by the character "3"
|
||||
\113 might be a backreference, otherwise the character with octal code 113
|
||||
\377 might be a backreference, otherwise the value 255 (decimal)
|
||||
\81 is always a backreference .sp
|
||||
\81 is always a backreference
|
||||
</pre>
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
must not be introduced by a leading zero, because no more than three octal
|
||||
|
@ -734,7 +745,7 @@ Unicode support is not needed for these characters to be recognized.
|
|||
<P>
|
||||
It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
|
||||
complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
|
||||
at compile time. (BSR is an abbrevation for "backslash R".) This can be made
|
||||
at compile time. (BSR is an abbreviation for "backslash R".) This can be made
|
||||
the default when PCRE2 is built; if this is the case, the other behaviour can
|
||||
be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
|
||||
these settings by starting a pattern string with one of the following
|
||||
|
@ -765,190 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
</P>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
</P>
|
||||
<P>
|
||||
The extra escape sequences that provide property support are:
|
||||
<pre>
|
||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The property names represented by <i>xx</i> above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
<a href="#extraprops">next section).</a>
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \P{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
The property names represented by <i>xx</i> above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
<a href="#extraprops">below).</a>
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \P{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
</P>
|
||||
<br><b>
|
||||
Script properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\p{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and an equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
</P>
|
||||
<P>
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
<pre>
|
||||
\p{Greek}
|
||||
\P{Han}
|
||||
</pre>
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The general category property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
|
@ -1010,9 +893,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
</pre>
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
</P>
|
||||
<P>
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
|
@ -1039,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
|
|||
example, \p{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
</P>
|
||||
<br><b>
|
||||
Binary (yes/no) properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The Bidi_Class property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
</pre>
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
</P>
|
||||
<br><b>
|
||||
Extended grapheme clusters
|
||||
|
@ -1075,7 +1000,7 @@ additional characters according to the following rules for ending a cluster:
|
|||
3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
|
||||
are of five types: L, V, T, LV, and LVT. An L character may be followed by an
|
||||
L, V, LV, or LVT character; an LV or V character may be followed by a V or T
|
||||
character; an LVT or T character may be follwed only by a T character.
|
||||
character; an LVT or T character may be followed only by a T character.
|
||||
</P>
|
||||
<P>
|
||||
4. Do not end before extending characters or spacing marks or the "zero-width
|
||||
|
@ -1160,8 +1085,11 @@ For example, when the pattern
|
|||
matches "foobar", the first substring is still set to "foo".
|
||||
</P>
|
||||
<P>
|
||||
Perl documents that the use of \K within assertions is "not well defined". In
|
||||
PCRE2, \K is acted upon when it occurs inside positive assertions, but is
|
||||
From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From
|
||||
release 10.38 PCRE2 also forbids this by default. However, the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
|
||||
<b>pcre2_compile()</b> to re-enable the previous behaviour. When this option is
|
||||
set, \K is acted upon when it occurs inside positive assertions, but is
|
||||
ignored in negative assertions. Note that when a pattern such as (?=ab\K)
|
||||
matches, the reported start of the match can be greater than the end of the
|
||||
match. Using \K in a lookbehind assertion at the start of a pattern can also
|
||||
|
@ -1318,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
<P>
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
<a href="#newlines">"Newline conventions"</a>
|
||||
above).
|
||||
</P>
|
||||
<P>
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
|
||||
of CR or LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
</P>
|
||||
<P>
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
|
@ -1438,7 +1368,10 @@ Characters in a class may be specified by their code points using \o, \x, or
|
|||
\N{U+hh..} in the usual way. When caseless matching is set, any letters in a
|
||||
class represent both their upper case and lower case versions, so for example,
|
||||
a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
|
||||
match "A", whereas a caseful version would.
|
||||
match "A", whereas a caseful version would. Note that there are two ASCII
|
||||
characters, K and S, that, in addition to their lower case ASCII equivalents,
|
||||
are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S)
|
||||
respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
</P>
|
||||
<P>
|
||||
Characters that might indicate line breaks are never treated in any special way
|
||||
|
@ -1650,7 +1583,7 @@ that succeeds is used. If the alternatives are within a group
|
|||
<a href="#group">(defined below),</a>
|
||||
"succeeds" means matching the rest of the main pattern as well as the
|
||||
alternative in the group.
|
||||
</P>
|
||||
<a name="internaloptions"></a></P>
|
||||
<br><a name="SEC13" href="#TOC1">INTERNAL OPTION SETTING</a><br>
|
||||
<P>
|
||||
The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL,
|
||||
|
@ -1901,12 +1834,19 @@ are permitted for groups with the same number, for example:
|
|||
(?|(?<AA>aa)|(?<AA>bb))
|
||||
</pre>
|
||||
The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES
|
||||
option at compile time, or by the use of (?J) within the pattern. Duplicate
|
||||
names can be useful for patterns where only one instance of the named capture
|
||||
group can match. Suppose you want to match the name of a weekday, either as a
|
||||
3-letter abbreviation or as the full name, and in both cases you want to
|
||||
extract the abbreviation. This pattern (ignoring the line breaks) does the job:
|
||||
option at compile time, or by the use of (?J) within the pattern, as described
|
||||
in the section entitled
|
||||
<a href="#internaloptions">"Internal Option Setting"</a>
|
||||
above.
|
||||
</P>
|
||||
<P>
|
||||
Duplicate names can be useful for patterns where only one instance of the named
|
||||
capture group can match. Suppose you want to match the name of a weekday,
|
||||
either as a 3-letter abbreviation or as the full name, and in both cases you
|
||||
want to extract the abbreviation. This pattern (ignoring the line breaks) does
|
||||
the job:
|
||||
<pre>
|
||||
(?J)
|
||||
(?<DN>Mon|Fri|Sun)(?:day)?|
|
||||
(?<DN>Tue)(?:sday)?|
|
||||
(?<DN>Wed)(?:nesday)?|
|
||||
|
@ -1927,7 +1867,7 @@ they appear in the overall pattern. The first one that is set is used for the
|
|||
reference. For example, this pattern matches both "foofoo" and "barbar" but not
|
||||
"foobar" or "barfoo":
|
||||
<pre>
|
||||
(?:(?<n>foo)|(?<n>bar))\k<n>
|
||||
(?J)(?:(?<n>foo)|(?<n>bar))\k<n>
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
|
@ -1961,7 +1901,7 @@ items:
|
|||
an escape such as \d or \pL that matches a single character
|
||||
a character class
|
||||
a backreference
|
||||
a parenthesized group (including most assertions)
|
||||
a parenthesized group (including lookaround assertions)
|
||||
a subroutine call (recursive or otherwise)
|
||||
</pre>
|
||||
The general repetition quantifier specifies a minimum and maximum number of
|
||||
|
@ -2147,10 +2087,10 @@ be easier to remember:
|
|||
<pre>
|
||||
(*atomic:\d+)foo
|
||||
</pre>
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
</P>
|
||||
<P>
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
|
@ -2349,11 +2289,11 @@ using alternation, as in the example above, or by a quantifier with a minimum
|
|||
of zero.
|
||||
</P>
|
||||
<P>
|
||||
Backreferences of this type cause the group that they reference to be treated
|
||||
as an
|
||||
For versions of PCRE2 less than 10.25, backreferences of this type used to
|
||||
cause the group that they reference to be treated as an
|
||||
<a href="#atomicgroup">atomic group.</a>
|
||||
Once the whole group has been matched, a subsequent matching failure cannot
|
||||
cause backtracking into the middle of the group.
|
||||
This restriction no longer applies, and backtracking into such groups can occur
|
||||
as normal.
|
||||
<a name="bigassertions"></a></P>
|
||||
<br><a name="SEC20" href="#TOC1">ASSERTIONS</a><br>
|
||||
<P>
|
||||
|
@ -2413,26 +2353,13 @@ control passes to the previous backtracking point, thus discarding any captured
|
|||
strings within the assertion.
|
||||
</P>
|
||||
<P>
|
||||
For compatibility with Perl, most assertion groups may be repeated; though it
|
||||
makes no sense to assert the same thing several times, the side effect of
|
||||
capturing may occasionally be useful. However, an assertion that forms the
|
||||
condition for a conditional group may not be quantified. In practice, for
|
||||
other assertions, there are only three cases:
|
||||
<br>
|
||||
<br>
|
||||
(1) If the quantifier is {0}, the assertion is never obeyed during matching.
|
||||
However, it may contain internal capture groups that are called from elsewhere
|
||||
via the
|
||||
<a href="#groupsassubroutines">subroutine mechanism.</a>
|
||||
<br>
|
||||
<br>
|
||||
(2) If the quantifier is {0,n} where n is greater than zero, it is treated as if it
|
||||
were {0,1}. At run time, the rest of the pattern match is tried with and
|
||||
without the assertion, the order depending on the greediness of the quantifier.
|
||||
<br>
|
||||
<br>
|
||||
(3) If the minimum repetition is greater than zero, the quantifier is ignored.
|
||||
The assertion is obeyed just once when encountered during matching.
|
||||
Most assertion groups may be repeated; though it makes no sense to assert the
|
||||
same thing several times, the side effect of capturing in positive assertions
|
||||
may occasionally be useful. However, an assertion that forms the condition for
|
||||
a conditional group may not be quantified. PCRE2 used to restrict the
|
||||
repetition of assertions, but from release 10.35 the only restriction is that
|
||||
an unlimited maximum repetition is changed to be one more than the minimum. For
|
||||
example, {3,} is treated as {3,4}.
|
||||
</P>
|
||||
<br><b>
|
||||
Alphabetic assertion names
|
||||
|
@ -2624,8 +2551,8 @@ backtracking into the assertion. However, there are some cases where non-atomic
|
|||
positive assertions can be useful. PCRE2 provides these using the following
|
||||
syntax:
|
||||
<pre>
|
||||
(*non_atomic_positive_lookahead: or (*napla:
|
||||
(*non_atomic_positive_lookbehind: or (*naplb:
|
||||
(*non_atomic_positive_lookahead: or (*napla: or (?*
|
||||
(*non_atomic_positive_lookbehind: or (*naplb: or (?<*
|
||||
</pre>
|
||||
Consider the problem of finding the right-most word in a string that also
|
||||
appears earlier in the string, that is, it must appear at least twice in total.
|
||||
|
@ -2665,9 +2592,15 @@ as before because nothing has changed, so using a non-atomic assertion just
|
|||
wastes resources.
|
||||
</P>
|
||||
<P>
|
||||
There is one exception to backtracking into a non-atomic assertion. If an
|
||||
(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That
|
||||
is, a subsequent match failure cannot backtrack into the assertion.
|
||||
</P>
|
||||
<P>
|
||||
Non-atomic assertions are not supported by the alternative matching function
|
||||
<b>pcre2_dfa_match()</b>. They are also not supported by JIT (but may be in
|
||||
future). Note that assertions that appear as conditions for
|
||||
<b>pcre2_dfa_match()</b>. They are supported by JIT, but only if they do not
|
||||
contain any control verbs such as (*ACCEPT). (This may change in future.) Note
|
||||
that assertions that appear as conditions for
|
||||
<a href="#conditions">conditional groups</a>
|
||||
(see below) must be atomic.
|
||||
</P>
|
||||
|
@ -2878,7 +2811,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
||||
\b (?&byte) (\.(?&byte)){3} \b
|
||||
</pre>
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -3588,7 +3521,7 @@ successful match if there is a later mismatch. Consider:
|
|||
</pre>
|
||||
If the subject is "aaaac...", after the first match attempt fails (starting at
|
||||
the first character in the string), the starting point skips on to start the
|
||||
next attempt at "c". Note that a possessive quantifer does not have the same
|
||||
next attempt at "c". Note that a possessive quantifier does not have the same
|
||||
effect as this example; although it would suppress backtracking during the
|
||||
first match attempt, the second attempt would start at the second character
|
||||
instead of skipping on to "c".
|
||||
|
@ -3826,16 +3759,16 @@ there is a backtrack at the outer level.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 29 July 2019
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -68,11 +68,14 @@ application. Because the POSIX functions call the native ones, it is also
|
|||
necessary to add <b>-lpcre2-8</b>.
|
||||
</P>
|
||||
<P>
|
||||
Although they are not defined as prototypes in <b>pcre2posix.h</b>, the library
|
||||
does contain functions with the POSIX names <b>regcomp()</b> etc. These simply
|
||||
pass their arguments to the PCRE2 functions. These functions are provided for
|
||||
backwards compatibility with earlier versions of PCRE2, so that existing
|
||||
programs do not have to be recompiled.
|
||||
Although they were not defined as prototypes in <b>pcre2posix.h</b>, releases
|
||||
10.33 to 10.36 of the library contained functions with the POSIX names
|
||||
<b>regcomp()</b> etc. These simply passed their arguments to the PCRE2
|
||||
functions. These functions were provided for backwards compatibility with
|
||||
earlier versions of PCRE2, which had only POSIX names. However, this has proved
|
||||
troublesome in situations where a program links with several libraries, some of
|
||||
which use PCRE2's POSIX interface while others use the real POSIX functions.
|
||||
For this reason, the POSIX names have been removed since release 10.37.
|
||||
</P>
|
||||
<P>
|
||||
Calling the header file <b>pcre2posix.h</b> avoids any conflict with other POSIX
|
||||
|
@ -344,9 +347,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 January 2019
|
||||
Last updated: 26 April 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||
<P>
|
||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
||||
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
<pre>
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
</pre>
|
||||
|
@ -154,7 +154,6 @@ management functions for the decoded patterns. If this argument is NULL,
|
|||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
<pre>
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
|
||||
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
|
||||
<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
|
||||
<li><a name="TOC13" href="#SEC13">CAPTURING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC15" href="#SEC15">COMMENT</a>
|
||||
<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
|
||||
<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC20" href="#SEC20">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
|
||||
<li><a name="TOC22" href="#SEC22">BACKREFERENCES</a>
|
||||
<li><a name="TOC23" href="#SEC23">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC24" href="#SEC24">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC25" href="#SEC25">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC26" href="#SEC26">CALLOUTS</a>
|
||||
<li><a name="TOC27" href="#SEC27">SEE ALSO</a>
|
||||
<li><a name="TOC28" href="#SEC28">AUTHOR</a>
|
||||
<li><a name="TOC29" href="#SEC29">REVISION</a>
|
||||
<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
|
||||
<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
|
||||
<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
|
||||
<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
|
||||
<li><a name="TOC15" href="#SEC15">CAPTURING</a>
|
||||
<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMMENT</a>
|
||||
<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
|
||||
<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
|
||||
<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
|
||||
<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
|
||||
<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
|
||||
<li><a name="TOC30" href="#SEC30">AUTHOR</a>
|
||||
<li><a name="TOC31" href="#SEC31">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
|
||||
<P>
|
||||
|
@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
|
|||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
</P>
|
||||
<P>
|
||||
Property descriptions in \p and \P are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
|
@ -152,6 +159,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
|
@ -198,162 +206,58 @@ characters.
|
|||
Perl and POSIX space are now the same. Perl added VT to its space character set
|
||||
at release 5.18.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||
<P>
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
[...] positive character class
|
||||
|
@ -381,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
|
|||
but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
||||
\Q...\E inside a character class.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
? 0 or 1, greedy
|
||||
|
@ -402,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
{n,}? n or more, lazy
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\b word boundary
|
||||
|
@ -420,20 +324,23 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
\G first matching position in subject
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\K set reported start of match
|
||||
</pre>
|
||||
From release 10.38 \K is not permitted by default in lookaround assertions,
|
||||
for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\K is honoured in positive assertions, but ignored in negative ones.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
expr|expr|expr...
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(...) capture group
|
||||
|
@ -448,26 +355,26 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
|
|||
in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
|
||||
both cases, a name must not start with a digit.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?>...) atomic non-capture group
|
||||
(*atomic:...) atomic non-capture group
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?#....) comment (not nestable)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
|
||||
<P>
|
||||
Changes of these options within a group are automatically cancelled at the end
|
||||
of the group.
|
||||
<pre>
|
||||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?J) allow duplicate named groups
|
||||
(?m) multiline
|
||||
(?n) no auto capture
|
||||
(?s) single line (dotall)
|
||||
|
@ -506,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
|
|||
application can lock out the use of (*UTF) and (*UCP) by setting the
|
||||
PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
settings with a similar syntax.
|
||||
|
@ -519,7 +426,7 @@ settings with a similar syntax.
|
|||
(*NUL) the NUL character (binary zero)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
setting with a similar syntax.
|
||||
|
@ -528,7 +435,7 @@ setting with a similar syntax.
|
|||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?=...) )
|
||||
|
@ -549,18 +456,20 @@ setting with a similar syntax.
|
|||
</pre>
|
||||
Each top-level branch of a lookbehind must be of a fixed length.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<P>
|
||||
These assertions are specific to PCRE2 and are not Perl-compatible.
|
||||
<pre>
|
||||
(*napla:...)
|
||||
(*non_atomic_positive_lookahead:...)
|
||||
(?*...) )
|
||||
(*napla:...) ) synonyms
|
||||
(*non_atomic_positive_lookahead:...) )
|
||||
|
||||
(*naplb:...)
|
||||
(*non_atomic_positive_lookbehind:...)
|
||||
(?<*...) )
|
||||
(*naplb:...) ) synonyms
|
||||
(*non_atomic_positive_lookbehind:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(*script_run:...) ) script run, can be backtracked into
|
||||
|
@ -570,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(*asr:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\n reference by number (can be ambiguous)
|
||||
|
@ -587,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
(?P=name) reference by name (Python)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?R) recurse whole pattern
|
||||
|
@ -606,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible.
|
|||
\g'-n' call subroutine by relative number (PCRE2 extension)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?(condition)yes-pattern)
|
||||
|
@ -629,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
|||
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||
condition if the relevant named group exists.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
|
||||
name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
|
||||
|
@ -656,7 +565,7 @@ pattern is not anchored.
|
|||
The effect of one of these verbs in a group called as a subroutine is confined
|
||||
to the subroutine call.
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">CALLOUTS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout (assumed number 0)
|
||||
|
@ -667,25 +576,25 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
|||
start and the end), and the starting delimiter { matched with the ending
|
||||
delimiter }. To encode the ending delimiter within the string, double it.
|
||||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC28" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC29" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 29 July 2019
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -59,12 +59,7 @@ patterns, and the subject lines specify PCRE2 function options, control how the
|
|||
subject is processed, and what output is produced.
|
||||
</P>
|
||||
<P>
|
||||
As the original fairly simple PCRE library evolved, it acquired many different
|
||||
features, and as a result, the original <b>pcretest</b> program ended up with a
|
||||
lot of options in a messy, arcane syntax for testing all the features. The
|
||||
move to the new PCRE2 API provided an opportunity to re-implement the test
|
||||
program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there
|
||||
are still many obscure modifiers, some of which are specifically designed for
|
||||
There are many obscure modifiers, some of which are specifically designed for
|
||||
use in conjunction with the test script and data files that are distributed as
|
||||
part of PCRE2. All the modifiers are documented here, some without much
|
||||
justification, but many of them are unlikely to be of use except when testing
|
||||
|
@ -83,16 +78,16 @@ to 8-bit code units for output.
|
|||
</P>
|
||||
<P>
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, <b>pcre_compile()</b>. The actual
|
||||
are given in generic form, for example, <b>pcre2_compile()</b>. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
<a name="inputencoding"></a></P>
|
||||
<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
|
||||
<P>
|
||||
Input to <b>pcre2test</b> is processed line by line, either by calling the C
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> library. In some
|
||||
Windows environments character 26 (hex 1A) causes an immediate end of file, and
|
||||
no further data is read, so this character should be avoided unless you really
|
||||
want that action.
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> or <b>libedit</b>
|
||||
library. In some Windows environments character 26 (hex 1A) causes an immediate
|
||||
end of file, and no further data is read, so this character should be avoided
|
||||
unless you really want that action.
|
||||
</P>
|
||||
<P>
|
||||
The input is processed using C's string functions, so must not
|
||||
|
@ -258,10 +253,22 @@ available, and the use of JIT for matching is verified.
|
|||
<b>-LM</b>
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
\fB-pattern\fB <i>modifier-list</i>
|
||||
<b>-LP</b>
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LS</b>
|
||||
List scripts: write a list of recognized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-pattern</b> <i>modifier-list</i>
|
||||
Behave as if each pattern line contains the given modifiers.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -323,7 +330,7 @@ test data, command lines that begin with # may appear. This file format, with
|
|||
some restrictions, can also be processed by the <b>perltest.sh</b> script that
|
||||
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
|
||||
and Perl is the same. For a specification of <b>perltest.sh</b>, see the
|
||||
comments near its beginning.
|
||||
comments near its beginning. See also the #perltest command below.
|
||||
</P>
|
||||
<P>
|
||||
When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
|
||||
|
@ -375,6 +382,12 @@ output.
|
|||
This command is used to load a set of precompiled patterns from a file, as
|
||||
described in the section entitled "Saving and restoring compiled patterns"
|
||||
<a href="#saverestore">below.</a>
|
||||
<pre>
|
||||
#loadtables <filename>
|
||||
</pre>
|
||||
This command is used to load a set of binary character tables that can be
|
||||
accessed by the tables=3 qualifier. Such tables can be created by the
|
||||
<b>pcre2_dftables</b> program with the -b option.
|
||||
<pre>
|
||||
#newline_default [<newline-list>]
|
||||
</pre>
|
||||
|
@ -414,14 +427,20 @@ patterns. Modifiers on a pattern can change these settings.
|
|||
<pre>
|
||||
#perltest
|
||||
</pre>
|
||||
The appearance of this line causes all subsequent modifier settings to be
|
||||
checked for compatibility with the <b>perltest.sh</b> script, which is used to
|
||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
||||
lines, #pattern commands, and #subject commands that set or unset "mark", no
|
||||
command lines are permitted, because they and many of the modifiers are
|
||||
specific to <b>pcre2test</b>, and should not be used in test files that are also
|
||||
processed by <b>perltest.sh</b>. The <b>#perltest</b> command helps detect tests
|
||||
that are accidentally put in the wrong file.
|
||||
This line is used in test files that can also be processed by <b>perltest.sh</b>
|
||||
to confirm that Perl gives the same results as PCRE2. Subsequent tests are
|
||||
checked for the use of <b>pcre2test</b> features that are incompatible with the
|
||||
<b>perltest.sh</b> script.
|
||||
</P>
|
||||
<P>
|
||||
Patterns must use '/' as their delimiter, and only certain modifiers are
|
||||
supported. Comment lines, #pattern commands, and #subject commands that set or
|
||||
unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
|
||||
#newline_default commands, which are needed in the relevant pcre2test files,
|
||||
are silently ignored. All other command lines are ignored, but give a warning
|
||||
message. The <b>#perltest</b> command helps detect tests that are accidentally
|
||||
put in the wrong file or use the wrong delimiter. For more details of the
|
||||
<b>perltest.sh</b> script see the comments it contains.
|
||||
<pre>
|
||||
#pop [<modifiers>]
|
||||
#popcopy [<modifiers>]
|
||||
|
@ -474,15 +493,17 @@ excluding pattern meta-characters):
|
|||
</pre>
|
||||
This is interpreted as the pattern's delimiter. A regular expression may be
|
||||
continued over several input lines, in which case the newline characters are
|
||||
included within it. It is possible to include the delimiter within the pattern
|
||||
by escaping it with a backslash, for example
|
||||
included within it. It is possible to include the delimiter as a literal within
|
||||
the pattern by escaping it with a backslash, for example
|
||||
<pre>
|
||||
/abc\/def/
|
||||
</pre>
|
||||
If you do this, the escape and the delimiter form part of the pattern, but
|
||||
since the delimiters are all non-alphanumeric, this does not affect its
|
||||
interpretation. If the terminating delimiter is immediately followed by a
|
||||
backslash, for example,
|
||||
since the delimiters are all non-alphanumeric, the inclusion of the backslash
|
||||
does not affect the pattern's interpretation. Note, however, that this trick
|
||||
does not work within \Q...\E literal bracketing because the backslash will
|
||||
itself be interpreted as a literal. If the terminating delimiter is immediately
|
||||
followed by a backslash, for example,
|
||||
<pre>
|
||||
/abc/\
|
||||
</pre>
|
||||
|
@ -500,11 +521,11 @@ A pattern can be followed by a modifier list (details below).
|
|||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br>
|
||||
<P>
|
||||
Before each subject line is passed to <b>pcre2_match()</b> or
|
||||
<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes, unless the <b>subject_literal</b>
|
||||
modifier was set for the pattern. The following provide a means of encoding
|
||||
non-printing characters in a visible way:
|
||||
Before each subject line is passed to <b>pcre2_match()</b>,
|
||||
<b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>, leading and trailing white
|
||||
space is removed, and the line is scanned for backslash escapes, unless the
|
||||
<b>subject_literal</b> modifier was set for the pattern. The following provide a
|
||||
means of encoding non-printing characters in a visible way:
|
||||
<pre>
|
||||
\a alarm (BEL, \x07)
|
||||
\b backspace (\x08)
|
||||
|
@ -601,6 +622,7 @@ way <b>pcre2_compile()</b> behaves. See
|
|||
for a description of the effects of these options.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
|
@ -679,7 +701,7 @@ heavily used in the test files.
|
|||
pushcopy push a copy onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
subject_literal treat all subject lines as literal
|
||||
tables=[0|1|2] select internal tables
|
||||
tables=[0|1|2|3] select internal tables
|
||||
use_length do not zero-terminate the pattern
|
||||
utf8_input treat input as UTF-8
|
||||
</pre>
|
||||
|
@ -1027,18 +1049,20 @@ Using alternative character tables
|
|||
</b><br>
|
||||
<P>
|
||||
The value specified for the <b>tables</b> modifier must be one of the digits 0,
|
||||
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||
<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
|
||||
different character tables. The digit specifies the tables as follows:
|
||||
1, 2, or 3. It causes a specific set of built-in character tables to be passed
|
||||
to <b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour
|
||||
with different character tables. The digit specifies the tables as follows:
|
||||
<pre>
|
||||
0 do not pass any special character tables
|
||||
1 the default ASCII tables, as distributed in
|
||||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
3 a set of tables loaded by the #loadtables command
|
||||
</pre>
|
||||
In table 2, some characters whose codes are greater than 128 are identified as
|
||||
letters, digits, spaces, etc. Setting alternate character tables and a locale
|
||||
are mutually exclusive.
|
||||
In tables 2, some characters whose codes are greater than 128 are identified as
|
||||
letters, digits, spaces, etc. Tables 3 can be used only after a
|
||||
<b>#loadtables</b> command has loaded them from a binary file. Setting alternate
|
||||
character tables and a locale are mutually exclusive.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting certain match controls
|
||||
|
@ -1050,24 +1074,27 @@ modifier list, in which case they are applied to every subject line that is
|
|||
processed with that pattern. These modifiers do not affect the compilation
|
||||
process.
|
||||
<pre>
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution <n>
|
||||
substitute_stop=<n> skip substitution <n> and following
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
</pre>
|
||||
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
||||
defaults, set them in a <b>#subject</b> command.
|
||||
|
@ -1196,7 +1223,7 @@ Setting match controls
|
|||
The following modifiers affect the matching process or request additional
|
||||
information. Some of them may also be specified on a pattern line (see above),
|
||||
in which case they apply to every subject line that is matched against that
|
||||
pattern.
|
||||
pattern, but can be overridden by modifiers on the subject.
|
||||
<pre>
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
|
@ -1214,7 +1241,8 @@ pattern.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1224,6 +1252,8 @@ pattern.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1233,8 +1263,11 @@ pattern.
|
|||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
@ -1395,9 +1428,15 @@ Testing the substitution function
|
|||
</b><br>
|
||||
<P>
|
||||
If the <b>replace</b> modifier is set, the <b>pcre2_substitute()</b> function is
|
||||
called instead of one of the matching functions. Note that replacement strings
|
||||
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||
not thought to be an issue in a test program.
|
||||
called instead of one of the matching functions (or after one call of
|
||||
<b>pcre2_match()</b> in the case of PCRE2_SUBSTITUTE_MATCHED). Note that
|
||||
replacement strings cannot contain commas, because a comma signifies the end of
|
||||
a modifier. This is not thought to be an issue in a test program.
|
||||
</P>
|
||||
<P>
|
||||
Specifying a completely empty replacement string disables this modifier.
|
||||
However, it is possible to specify an empty replacement by providing a buffer
|
||||
length, as described below, for an otherwise empty replacement.
|
||||
</P>
|
||||
<P>
|
||||
Unlike subject strings, <b>pcre2test</b> does not process replacement strings
|
||||
|
@ -1413,11 +1452,16 @@ for <b>pcre2_substitute()</b>:
|
|||
<pre>
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
</PRE>
|
||||
</pre>
|
||||
See the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation for details of these options.
|
||||
</P>
|
||||
<P>
|
||||
After a successful substitution, the modified string is output, preceded by the
|
||||
|
@ -1521,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1531,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1560,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1580,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1639,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
|
@ -1647,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
<b>null_replacement</b> modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
|
@ -2086,16 +2136,16 @@ on the stack.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 July 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -19,7 +19,7 @@ UNICODE AND UTF SUPPORT
|
|||
PCRE2 is normally built with Unicode support, though if you do not need it, you
|
||||
can build it without, in which case the library will be smaller. With Unicode
|
||||
support, PCRE2 has knowledge of Unicode character properties and can process
|
||||
text strings in UTF-8, UTF-16, or UTF-32 format (depending on the code unit
|
||||
strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit
|
||||
width), but this is not the default. Unless specifically requested, PCRE2
|
||||
treats each code unit in a string as one character.
|
||||
</P>
|
||||
|
@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
|
|||
<P>
|
||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||
\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
and
|
||||
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -134,14 +135,16 @@ However, the special horizontal and vertical white space matching escapes (\h,
|
|||
not PCRE2_UCP is set.
|
||||
</P>
|
||||
<br><b>
|
||||
CASE-EQUIVALENCE IN UTF MODE
|
||||
UNICODE CASE-EQUIVALENCE
|
||||
</b><br>
|
||||
<P>
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties except
|
||||
for characters whose code points are less than 128 and that have at most two
|
||||
case-equivalent values. For these, a direct table lookup is used for speed. A
|
||||
few Unicode characters such as Greek sigma have more than two code points that
|
||||
are case-equivalent, and these are treated specially.
|
||||
If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use
|
||||
of Unicode properties except for characters whose code points are less than 128
|
||||
and that have at most two case-equivalent values. For these, a direct table
|
||||
lookup is used for speed. A few Unicode characters such as Greek sigma have
|
||||
more than two code points that are case-equivalent, and these are treated
|
||||
specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case
|
||||
processing for non-UTF character encodings such as UCS-2.
|
||||
<a name="scriptruns"></a></P>
|
||||
<br><b>
|
||||
SCRIPT RUNS
|
||||
|
@ -475,7 +478,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -484,9 +487,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 24 May 2019
|
||||
Last updated: 22 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
22
doc/pcre2.3
22
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "17 September 2018" "PCRE2 10.33"
|
||||
.TH PCRE2 3 "27 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -11,7 +11,8 @@ nearly two decades, the limitations of the original API were making development
|
|||
increasingly difficult. The new API is more extensible, and it was simplified
|
||||
by abolishing the separate "study" optimizing function; in PCRE2, patterns are
|
||||
automatically optimized where possible. Since forking from PCRE1, the code has
|
||||
been extensively refactored and new features introduced.
|
||||
been extensively refactored and new features introduced. The old library is now
|
||||
obsolete and is no longer maintained.
|
||||
.P
|
||||
As well as Perl-style regular expression patterns, some features that appeared
|
||||
in Python and the original PCRE before they appeared in Perl are available
|
||||
|
@ -19,8 +20,13 @@ using the Python syntax. There is also some support for one or two .NET and
|
|||
Oniguruma syntax items, and there are options for requesting some minor changes
|
||||
that give better ECMAScript (aka JavaScript) compatibility.
|
||||
.P
|
||||
The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
|
||||
code units, which means that up to three separate libraries may be installed.
|
||||
The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit,
|
||||
or 32-bit code units, which means that up to three separate libraries may be
|
||||
installed, one for each code unit size. The size of code unit is not related to
|
||||
the bit size of the underlying hardware. In a 64-bit environment that also
|
||||
supports 32-bit applications, versions of PCRE2 that are compiled in both
|
||||
64-bit and 32-bit modes may be needed.
|
||||
.P
|
||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||
|
@ -185,18 +191,18 @@ function, listing its arguments and results.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.P
|
||||
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||
use my two names separated by a dot at gmail.com.
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
Last updated: 27 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
5132
doc/pcre2.txt
5132
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -80,8 +80,17 @@ Additional options may be set in the compile context via the
|
|||
.\"
|
||||
function.
|
||||
.P
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the \fIerrorcode\fP argument to the
|
||||
\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
|
||||
error was encountered is returned via the \fIerroroffset\fP argument.
|
||||
.P
|
||||
If there is no error, the value passed via \fIerrorcode\fP returns the message
|
||||
"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
|
||||
via \fIerroroffset\fP is zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
each option, in the
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_DFA_MATCH 3 "16 October 2018" "PCRE2 10.33"
|
||||
.TH PCRE2_DFA_MATCH 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -33,10 +33,15 @@ just once (except when processing lookaround assertions). This function is
|
|||
\fIworkspace\fP Points to a vector of ints used as working space
|
||||
\fIwscount\fP Number of elements in the vector
|
||||
.sp
|
||||
For \fBpcre2_dfa_match()\fP, a match context is needed only if you want to set
|
||||
up a callout function or specify the heap limit or the match or the recursion
|
||||
depth limits. The \fIlength\fP and \fIstartoffset\fP values are code units, not
|
||||
characters. The options are:
|
||||
The size of output vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
|
||||
data block is therefore not advisable when using this function.
|
||||
.P
|
||||
A match context is needed only if you want to set up a callout function or
|
||||
specify the heap limit or the match or the recursion depth limits. The
|
||||
\fIlength\fP and \fIstartoffset\fP values are code units, not characters. The
|
||||
options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_COPY_MATCHED_SUBJECT
|
||||
|
|
|
@ -17,7 +17,7 @@ This function frees unused JIT executable memory. The argument is a general
|
|||
context, for custom memory management, or NULL for standard memory management.
|
||||
JIT memory allocation retains some memory in order to improve future JIT
|
||||
compilation speed. In low memory conditions,
|
||||
\fBpcre2_jit_free_unused_memory()\fB can be used to cause this memory to be
|
||||
\fBpcre2_jit_free_unused_memory()\fP can be used to cause this memory to be
|
||||
freed.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_JIT_MATCH 3 "03 November 2014" "PCRE2 10.0"
|
||||
.TH PCRE2_JIT_MATCH 3 "11 February 2020" "PCRE2 10.35"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -22,8 +22,10 @@ algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and
|
|||
it bypasses some of the sanity checks that \fBpcre2_match()\fP applies.
|
||||
Its arguments are exactly the same as for
|
||||
.\" HREF
|
||||
\fBpcre2_match()\fP.
|
||||
\fBpcre2_match()\fP,
|
||||
.\"
|
||||
except that the subject string must be specified with a length;
|
||||
PCRE2_ZERO_TERMINATED is not supported.
|
||||
.P
|
||||
The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
PCRE2_NOTEMPTY_ATSTART, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Unsupported
|
||||
|
|
|
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
\fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
|
||||
which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
.\" HREF
|
||||
\fBpcre2jit\fP
|
||||
.\"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_MATCH_DATA_CREATE 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2_MATCH_DATA_CREATE 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -18,8 +18,9 @@ This function creates a new match data block, which is used for holding the
|
|||
result of a match. The first argument specifies the number of pairs of offsets
|
||||
that are required. These form the "output vector" (ovector) within the match
|
||||
data block, and are used to identify the matched string and any captured
|
||||
substrings. There is always one pair of offsets; if \fBovecsize\fP is zero, it
|
||||
is treated as one.
|
||||
substrings when matching with \fBpcre2_match()\fP, or a number of different
|
||||
matches at the same point when used with \fBpcre2_dfa_match()\fP. There is
|
||||
always one pair of offsets; if \fBovecsize\fP is zero, it is treated as one.
|
||||
.P
|
||||
The second argument points to a general context, for custom memory management,
|
||||
or is NULL for system memory management. The result of the function is NULL if
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -14,12 +14,15 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function creates a new match data block, which is used for holding the
|
||||
result of a match. The first argument points to a compiled pattern. The number
|
||||
of capturing parentheses within the pattern is used to compute the number of
|
||||
pairs of offsets that are required in the match data block. These form the
|
||||
"output vector" (ovector) within the match data block, and are used to identify
|
||||
the matched string and any captured substrings.
|
||||
This function creates a new match data block for holding the result of a match.
|
||||
The first argument points to a compiled pattern. The number of capturing
|
||||
parentheses within the pattern is used to compute the number of pairs of
|
||||
offsets that are required in the match data block. These form the "output
|
||||
vector" (ovector) within the match data block, and are used to identify the
|
||||
matched string and any captured substrings when matching with
|
||||
\fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the
|
||||
output vector in a different way, you should use \fBpcre2_match_data_create()\fP
|
||||
instead of this function.
|
||||
.P
|
||||
The second argument points to a general context, for custom memory management,
|
||||
or is NULL to use the same memory allocator as was used for the compiled
|
||||
|
|
|
@ -36,7 +36,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL \fIcodes\fP or \fIbytes\fP is NULL
|
||||
.sp
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SET_CHARACTER_TABLES 3 "22 October 2014" "PCRE2 10.00"
|
||||
.TH PCRE2_SET_CHARACTER_TABLES 3 "20 March 2020" "PCRE2 10.35"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -15,9 +15,14 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.rs
|
||||
.sp
|
||||
This function sets a pointer to custom character tables within a compile
|
||||
context. The second argument must be the result of a call to
|
||||
\fBpcre2_maketables()\fP or NULL to request the default tables. The result is
|
||||
always zero.
|
||||
context. The second argument must point to a set of PCRE2 character tables or
|
||||
be NULL to request the default tables. The result is always zero. Character
|
||||
tables can be created by calling \fBpcre2_maketables()\fP or by running the
|
||||
\fBpcre2_dftables\fP maintenance command in binary mode (see the
|
||||
.\" HREF
|
||||
\fBpcre2build\fP
|
||||
.\"
|
||||
documentation).
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "11 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "31 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -18,12 +18,13 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{df800} to \ex{dfff}
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and \ex
|
||||
handling
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and
|
||||
\ex handling
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as
|
||||
a literal following character
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SUBSTITUTE 3 "04 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2_SUBSTITUTE 3 "22 January 2020" "PCRE2 10.35"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -36,8 +36,8 @@ Its arguments are:
|
|||
\fIoutlengthptr\fP Points to the length of the output buffer
|
||||
.sp
|
||||
A match data block is needed only if you want to inspect the data from the
|
||||
match that is returned in that block. A match context is needed only if you
|
||||
want to:
|
||||
final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is
|
||||
set. A match context is needed only if you want to:
|
||||
.sp
|
||||
Set up a callout function
|
||||
Set a matching offset limit
|
||||
|
@ -45,33 +45,57 @@ want to:
|
|||
Change the backtracking depth limit
|
||||
Set custom memory management in the match context
|
||||
.sp
|
||||
The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code
|
||||
units, not characters, as is the contents of the variable pointed at by
|
||||
\fIoutlengthptr\fP, which is updated to the actual length of the new string.
|
||||
The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code units,
|
||||
not characters, as are the contents of the variable pointed at by
|
||||
\fIoutlengthptr\fP. This variable must contain the length of the output buffer
|
||||
when the function is called. If the function is successful, the value is
|
||||
changed to the length of the new string, excluding the trailing zero that is
|
||||
automatically added.
|
||||
.P
|
||||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NOTBOL Subject is not the beginning of a
|
||||
line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_NOTEMPTY An empty string is not a
|
||||
valid match
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of
|
||||
the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in
|
||||
the subject or replacement
|
||||
.\" JOIN
|
||||
(only relevant if PCRE2_UTF was
|
||||
set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the
|
||||
subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for
|
||||
first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
.sp
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
.P
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
|
||||
contents must be the result of a call to \fBpcre2_match()\fP using the same
|
||||
pattern and subject.
|
||||
.P
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
are no matches. The result may be greater than one only when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned.
|
||||
.P
|
||||
|
|
437
doc/pcre2api.3
437
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "02 September 2019" "PCRE2 10.34"
|
||||
.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -187,7 +187,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
.B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
|
||||
.B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
|
||||
.B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
|
||||
.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacementzfP,"
|
||||
.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacementz\fP,"
|
||||
.B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP,"
|
||||
.B " PCRE2_SIZE *\fIoutlengthptr\fP);"
|
||||
.fi
|
||||
|
@ -564,24 +564,53 @@ documentation for more details.
|
|||
.P
|
||||
In a more complicated situation, where patterns are compiled only when they are
|
||||
first needed, but are still shared between threads, pointers to compiled
|
||||
patterns must be protected from simultaneous writing by multiple threads, at
|
||||
least until a pattern has been compiled. The logic can be something like this:
|
||||
patterns must be protected from simultaneous writing by multiple threads. This
|
||||
is somewhat tricky to do correctly. If you know that writing to a pointer is
|
||||
atomic in your environment, you can use logic like this:
|
||||
.sp
|
||||
Get a read-only (shared) lock (mutex) for pointer
|
||||
if (pointer == NULL)
|
||||
{
|
||||
Get a write (unique) lock for pointer
|
||||
pointer = pcre2_compile(...
|
||||
if (pointer == NULL) pointer = pcre2_compile(...
|
||||
}
|
||||
Release the lock
|
||||
Use pointer in pcre2_match()
|
||||
.sp
|
||||
Of course, testing for compilation errors should also be included in the code.
|
||||
.P
|
||||
If JIT is being used, but the JIT compilation is not being done immediately,
|
||||
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
||||
required. JIT compilation updates a pointer within the compiled code block, so
|
||||
a thread must gain unique write access to the pointer before calling
|
||||
The reason for checking the pointer a second time is as follows: Several
|
||||
threads may have acquired the shared lock and tested the pointer for being
|
||||
NULL, but only one of them will be given the write lock, with the rest kept
|
||||
waiting. The winning thread will compile the pattern and store the result.
|
||||
After this thread releases the write lock, another thread will get it, and if
|
||||
it does not retest pointer for being NULL, will recompile the pattern and
|
||||
overwrite the pointer, creating a memory leak and possibly causing other
|
||||
issues.
|
||||
.P
|
||||
In an environment where writing to a pointer may not be atomic, the above logic
|
||||
is not sufficient. The thread that is doing the compiling may be descheduled
|
||||
after writing only part of the pointer, which could cause other threads to use
|
||||
an invalid value. Instead of checking the pointer itself, a separate "pointer
|
||||
is valid" flag (that can be updated atomically) must be used:
|
||||
.sp
|
||||
Get a read-only (shared) lock (mutex) for pointer
|
||||
if (!pointer_is_valid)
|
||||
{
|
||||
Get a write (unique) lock for pointer
|
||||
if (!pointer_is_valid)
|
||||
{
|
||||
pointer = pcre2_compile(...
|
||||
pointer_is_valid = TRUE
|
||||
}
|
||||
}
|
||||
Release the lock
|
||||
Use pointer in pcre2_match()
|
||||
.sp
|
||||
If JIT is being used, but the JIT compilation is not being done immediately
|
||||
(perhaps waiting to see if the pattern is used often enough), similar logic is
|
||||
required. JIT compilation updates a value within the compiled code block, so a
|
||||
thread must gain unique write access to the pointer before calling
|
||||
\fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP or
|
||||
\fBpcre2_code_copy_with_tables()\fP can be used to obtain a private copy of the
|
||||
compiled code before calling the JIT compiler.
|
||||
|
@ -924,7 +953,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
.P
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
pattern of the form
|
||||
|
@ -935,18 +964,18 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
|||
less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
|
||||
limit is set, less than the default.
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The \fBpcre2_match()\fP function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
\fBpcre2_match()\fP uses the heap are given in the
|
||||
.\" HREF
|
||||
\fBpcre2perform\fP
|
||||
.\"
|
||||
documentation.
|
||||
.P
|
||||
Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
|
||||
|
@ -990,10 +1019,10 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
|
|||
.fi
|
||||
.sp
|
||||
This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -1034,12 +1063,13 @@ less than the limit set by the caller of \fBpcre2_match()\fP or
|
|||
.sp
|
||||
.B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||
.P
|
||||
The function \fBpcre2_config()\fP makes it possible for a PCRE2 client to
|
||||
discover which optional features have been compiled into the PCRE2 library. The
|
||||
The function \fBpcre2_config()\fP makes it possible for a PCRE2 client to find
|
||||
the value of certain configuration parameters and to discover which optional
|
||||
features have been compiled into the PCRE2 library. The
|
||||
.\" HREF
|
||||
\fBpcre2build\fP
|
||||
.\"
|
||||
documentation has more details about these optional features.
|
||||
documentation has more details about these features.
|
||||
.P
|
||||
The first argument for \fBpcre2_config()\fP specifies which information is
|
||||
required. The second argument is a pointer to memory into which the information
|
||||
|
@ -1152,6 +1182,16 @@ over compilation stack usage, see \fBpcre2_set_compile_recursion_guard()\fP.
|
|||
.sp
|
||||
This parameter is obsolete and should not be used in new code. The output is a
|
||||
uint32_t integer that is always set to zero.
|
||||
.sp
|
||||
PCRE2_CONFIG_TABLES_LENGTH
|
||||
.sp
|
||||
The output is a uint32_t integer that gives the length of PCRE2's character
|
||||
processing tables in bytes. For details of these tables see the
|
||||
.\" HTML <a href="#localesupport">
|
||||
.\" </a>
|
||||
section on locale support
|
||||
.\"
|
||||
below.
|
||||
.sp
|
||||
PCRE2_CONFIG_UNICODE_VERSION
|
||||
.sp
|
||||
|
@ -1283,8 +1323,7 @@ If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
|
|||
NULL immediately. Otherwise, the variables to which these point are set to an
|
||||
error code and an offset (number of code units) within the pattern,
|
||||
respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
|
||||
error has occurred. The values are not defined when compilation is successful
|
||||
and \fBpcre2_compile()\fP returns a non-NULL value.
|
||||
error has occurred.
|
||||
.P
|
||||
There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
|
||||
if it finds an error in the pattern. There are also some negative error codes
|
||||
|
@ -1303,14 +1342,17 @@ message"
|
|||
below)
|
||||
.\"
|
||||
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
||||
for both positive and negative error codes in \fBpcre2.h\fP.
|
||||
for both positive and negative error codes in \fBpcre2.h\fP. When compilation
|
||||
is successful \fIerrorcode\fP is set to a value that returns the message "no
|
||||
error" if passed to \fBpcre2_get_error_message()\fP.
|
||||
.P
|
||||
The value returned in \fIerroroffset\fP is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
||||
first code unit of the failing character.
|
||||
pattern an error occurred. When there is no error, zero is returned. A non-zero
|
||||
value is not necessarily the furthest point in the pattern that was read. For
|
||||
example, after the error "lookbehind assertion is not fixed length", the error
|
||||
offset points to the start of the failing assertion. For an invalid UTF-8 or
|
||||
UTF-16 string, the offset is that of the first code unit of the failing
|
||||
character.
|
||||
.P
|
||||
Some errors are not detected until the whole pattern has been scanned; in these
|
||||
cases, the offset passed back is the length of the pattern. Note that the
|
||||
|
@ -1420,13 +1462,16 @@ documentation.
|
|||
.sp
|
||||
If this bit is set, letters in the pattern match both upper and lower case
|
||||
letters in the subject. It is equivalent to Perl's /i option, and it can be
|
||||
changed within a pattern by a (?i) option setting. If PCRE2_UTF is set, Unicode
|
||||
properties are used for all characters with more than one other case, and for
|
||||
all characters whose code points are greater than U+007F. For lower valued
|
||||
characters with only one other case, a lookup table is used for speed. When
|
||||
PCRE2_UTF is not set, a lookup table is used for all code points less than 256,
|
||||
and higher code points (available only in 16-bit or 32-bit mode) are treated as
|
||||
not having another case.
|
||||
changed within a pattern by a (?i) option setting. If either PCRE2_UTF or
|
||||
PCRE2_UCP is set, Unicode properties are used for all characters with more than
|
||||
one other case, and for all characters whose code points are greater than
|
||||
U+007F. Note that there are two ASCII characters, K and S, that, in addition to
|
||||
their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin
|
||||
sign) and U+017F (long S) respectively. For lower valued characters with only
|
||||
one other case, a lookup table is used for speed. When neither PCRE2_UTF nor
|
||||
PCRE2_UCP is set, a lookup table is used for all code points less than 256, and
|
||||
higher code points (available only in 16-bit or 32-bit mode) are treated as not
|
||||
having another case.
|
||||
.sp
|
||||
PCRE2_DOLLAR_ENDONLY
|
||||
.sp
|
||||
|
@ -1751,7 +1796,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is
|
|||
undefined. It may cause your program to crash or loop.
|
||||
.P
|
||||
Note that this option can also be passed to \fBpcre2_match()\fP and
|
||||
\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
.P
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
|
@ -1769,10 +1814,11 @@ are not representable in UTF-16.
|
|||
.sp
|
||||
PCRE2_UCP
|
||||
.sp
|
||||
This option changes the way PCRE2 processes \eB, \eb, \eD, \ed, \eS, \es, \eW,
|
||||
\ew, and some of the POSIX character classes. By default, only ASCII characters
|
||||
are recognized, but if PCRE2_UCP is set, Unicode properties are used instead to
|
||||
classify characters. More details are given in the section on
|
||||
This option has two effects. Firstly, it changes the way PCRE2 processes \eB,
|
||||
\eb, \eD, \ed, \eS, \es, \eW, \ew, and some of the POSIX character classes. By
|
||||
default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode
|
||||
properties are used instead to classify characters. More details are given in
|
||||
the section on
|
||||
.\" HTML <a href="pcre2pattern.html#genericchartypes">
|
||||
.\" </a>
|
||||
generic character types
|
||||
|
@ -1782,8 +1828,13 @@ in the
|
|||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
page. If you set PCRE2_UCP, matching one of the items it affects takes much
|
||||
longer. The option is available only if PCRE2 has been compiled with Unicode
|
||||
support (which is the default).
|
||||
longer.
|
||||
.P
|
||||
The second effect of PCRE2_UCP is to force the use of Unicode properties for
|
||||
upper/lower casing operations on characters with code points greater than 127,
|
||||
even when PCRE2_UTF is not set. This makes it possible, for example, to process
|
||||
strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has
|
||||
been compiled with Unicode support (which is the default).
|
||||
.sp
|
||||
PCRE2_UNGREEDY
|
||||
.sp
|
||||
|
@ -1826,6 +1877,13 @@ characters with code points greater than 127.
|
|||
.sp
|
||||
The option bits that can be set in a compile context by calling the
|
||||
\fBpcre2_set_compile_extra_options()\fP function are as follows:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
.sp
|
||||
Since release 10.38 PCRE2 has forbidden the use of \eK within lookaround
|
||||
assertions, following Perl's lead. This option is provided to re-enable the
|
||||
previous behaviour (act in positive lookarounds, ignore in negative ones) in
|
||||
case anybody is relying on it.
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
.sp
|
||||
|
@ -1957,13 +2015,18 @@ PCRE2 handles caseless matching, and determines whether characters are letters,
|
|||
digits, or whatever, by reference to a set of tables, indexed by character code
|
||||
point. However, this applies only to characters whose code points are less than
|
||||
256. By default, higher-valued code points never match escapes such as \ew or
|
||||
\ed. When PCRE2 is built with Unicode support (the default), all characters can
|
||||
be tested with \ep and \eP, or, alternatively, the PCRE2_UCP option can be set
|
||||
when a pattern is compiled; this causes \ew and friends to use Unicode property
|
||||
support instead of the built-in tables.
|
||||
\ed.
|
||||
.P
|
||||
When PCRE2 is built with Unicode support (the default), certain Unicode
|
||||
character properties can be tested with \ep and \eP, or, alternatively, the
|
||||
PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and
|
||||
friends to use Unicode property support instead of the built-in tables.
|
||||
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
||||
points greater than 127 to use Unicode properties. These effects apply even
|
||||
when PCRE2_UTF is not set.
|
||||
.P
|
||||
The use of locales with Unicode is discouraged. If you are handling characters
|
||||
with code points greater than 128, you should either use Unicode support, or
|
||||
with code points greater than 127, you should either use Unicode support, or
|
||||
use locales, but not try to mix the two.
|
||||
.P
|
||||
PCRE2 contains a built-in set of character tables that are used by default.
|
||||
|
@ -1985,7 +2048,7 @@ the system \fBmalloc()\fP is used. The result can be passed to
|
|||
calling \fBpcre2_set_character_tables()\fP to set the tables pointer therein.
|
||||
.P
|
||||
For example, to build and use tables that are appropriate for the French locale
|
||||
(where accented characters with values greater than 128 are treated as
|
||||
(where accented characters with values greater than 127 are treated as
|
||||
letters), the following code could be used:
|
||||
.sp
|
||||
setlocale(LC_CTYPE, "fr_FR");
|
||||
|
@ -1998,10 +2061,10 @@ The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
|
|||
are using Windows, the name for the French locale is "french".
|
||||
.P
|
||||
The pointer that is passed (via the compile context) to \fBpcre2_compile()\fP
|
||||
is saved with the compiled pattern, and the same tables are used by
|
||||
\fBpcre2_match()\fP and \fBpcre_dfa_match()\fP. Thus, for any single pattern,
|
||||
compilation and matching both happen in the same locale, but different patterns
|
||||
can be processed in different locales.
|
||||
is saved with the compiled pattern, and the same tables are used by the
|
||||
matching functions. Thus, for any single pattern, compilation and matching both
|
||||
happen in the same locale, but different patterns can be processed in different
|
||||
locales.
|
||||
.P
|
||||
It is the caller's responsibility to ensure that the memory containing the
|
||||
tables remains available while they are still in use. When they are no longer
|
||||
|
@ -2010,6 +2073,26 @@ pass as its first parameter the same global context that was used to create the
|
|||
tables.
|
||||
.
|
||||
.
|
||||
.SS "Saving locale tables"
|
||||
.rs
|
||||
.sp
|
||||
The tables described above are just a sequence of binary bytes, which makes
|
||||
them independent of hardware characteristics such as endianness or whether the
|
||||
processor is 32-bit or 64-bit. A copy of the result of \fBpcre2_maketables()\fP
|
||||
can therefore be saved in a file or elsewhere and re-used later, even in a
|
||||
different program or on another computer. The size of the tables (number of
|
||||
bytes) must be obtained by calling \fBpcre2_config()\fP with the
|
||||
PCRE2_CONFIG_TABLES_LENGTH option because \fBpcre2_maketables()\fP does not
|
||||
return this value. Note that the \fBpcre2_dftables\fP program, which is part of
|
||||
the PCRE2 build system, can be used stand-alone to create a file that contains
|
||||
a set of binary tables. See the
|
||||
.\" HTML <a href="pcre2build.html#createtables">
|
||||
.\" </a>
|
||||
\fBpcre2build\fP
|
||||
.\"
|
||||
documentation for details.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="infoaboutpattern"></a>
|
||||
.SH "INFORMATION ABOUT A COMPILED PATTERN"
|
||||
.rs
|
||||
|
@ -2020,7 +2103,7 @@ tables.
|
|||
.P
|
||||
The \fBpcre2_pattern_info()\fP function returns general information about a
|
||||
compiled pattern. For information about callouts, see the
|
||||
.\" HTML <a href="pcre2pattern.html#infoaboutcallouts">
|
||||
.\" HTML <a href="#infoaboutcallouts">
|
||||
.\" </a>
|
||||
next section.
|
||||
.\"
|
||||
|
@ -2198,7 +2281,7 @@ return zero. The third argument should point to a \fBsize_t\fP variable.
|
|||
PCRE2_INFO_LASTCODETYPE
|
||||
.sp
|
||||
Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
matched string, other than at its start. The third argument should point to a
|
||||
\fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
|
@ -2416,19 +2499,27 @@ to an abstract format like Java or .NET serialization.
|
|||
Information about a successful or unsuccessful match is placed in a match
|
||||
data block, which is an opaque structure that is accessed by function calls. In
|
||||
particular, the match data block contains a vector of offsets into the subject
|
||||
string that define the matched part of the subject and any substrings that were
|
||||
captured. This is known as the \fIovector\fP.
|
||||
string that define the matched parts of the subject. This is known as the
|
||||
\fIovector\fP.
|
||||
.P
|
||||
Before calling \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or
|
||||
\fBpcre2_jit_match()\fP you must create a match data block by calling one of
|
||||
the creation functions above. For \fBpcre2_match_data_create()\fP, the first
|
||||
argument is the number of pairs of offsets in the \fIovector\fP. One pair of
|
||||
offsets is required to identify the string that matched the whole pattern, with
|
||||
an additional pair for each captured substring. For example, a value of 4
|
||||
creates enough space to record the matched portion of the subject plus three
|
||||
captured substrings. A minimum of at least 1 pair is imposed by
|
||||
\fBpcre2_match_data_create()\fP, so it is always possible to return the overall
|
||||
matched string.
|
||||
argument is the number of pairs of offsets in the \fIovector\fP.
|
||||
.P
|
||||
When using \fBpcre2_match()\fP, one pair of offsets is required to identify the
|
||||
string that matched the whole pattern, with an additional pair for each
|
||||
captured substring. For example, a value of 4 creates enough space to record
|
||||
the matched portion of the subject plus three captured substrings.
|
||||
.P
|
||||
When using \fBpcre2_dfa_match()\fP there may be multiple matched substrings of
|
||||
different lengths at the same point in the subject. The ovector should be made
|
||||
large enough to hold as many as are expected.
|
||||
.P
|
||||
A minimum of 1 pair is imposed by \fBpcre2_match_data_create()\fP, so
|
||||
it is always possible to return the overall matched string in the case of
|
||||
\fBpcre2_match()\fP or the longest match in the case of
|
||||
\fBpcre2_dfa_match()\fP.
|
||||
.P
|
||||
The second argument of \fBpcre2_match_data_create()\fP is a pointer to a
|
||||
general context, which can specify custom memory management for obtaining the
|
||||
|
@ -2437,10 +2528,11 @@ pass NULL, which causes \fBmalloc()\fP to be used.
|
|||
.P
|
||||
For \fBpcre2_match_data_create_from_pattern()\fP, the first argument is a
|
||||
pointer to a compiled pattern. The ovector is created to be exactly the right
|
||||
size to hold all the substrings a pattern might capture. The second argument is
|
||||
again a pointer to a general context, but in this case if NULL is passed, the
|
||||
memory is obtained using the same allocator that was used for the compiled
|
||||
pattern (custom or default).
|
||||
size to hold all the substrings a pattern might capture when matched using
|
||||
\fBpcre2_match()\fP. You should not use this call when matching with
|
||||
\fBpcre2_dfa_match()\fP. The second argument is again a pointer to a general
|
||||
context, but in this case if NULL is passed, the memory is obtained using the
|
||||
same allocator that was used for the compiled pattern (custom or default).
|
||||
.P
|
||||
A match data block can be used many times, with the same or different compiled
|
||||
patterns. You can extract information from a match data block after a match
|
||||
|
@ -2534,7 +2626,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in
|
|||
\fIstartoffset\fP. The length and offset are in code units, not characters.
|
||||
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
||||
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
||||
UTF processing is enabled.
|
||||
UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and
|
||||
\fIlength\fP is zero, the subject is assumed to be an empty string. If
|
||||
\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL.
|
||||
.P
|
||||
If \fIstartoffset\fP is greater than the length of the subject,
|
||||
\fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
|
||||
|
@ -2554,10 +2648,10 @@ lookbehind. For example, consider the pattern
|
|||
.sp
|
||||
which finds occurrences of "iss" in the middle of words. (\eB matches only if
|
||||
the current position in the subject is not a word boundary.) When applied to
|
||||
the string "Mississipi" the first call to \fBpcre2_match()\fP finds the first
|
||||
the string "Mississippi" the first call to \fBpcre2_match()\fP finds the first
|
||||
occurrence. If \fBpcre2_match()\fP is called again with just the remainder of
|
||||
the subject, namely "issipi", it does not match, because \eB is always false at
|
||||
the start of the subject, which is deemed to be a word boundary. However, if
|
||||
the subject, namely "issippi", it does not match, because \eB is always false
|
||||
at the start of the subject, which is deemed to be a word boundary. However, if
|
||||
\fBpcre2_match()\fP is passed the entire string again, but with
|
||||
\fIstartoffset\fP set to 4, it finds the second occurrence of "iss" because it
|
||||
is able to look behind the starting point to discover that it is preceded by a
|
||||
|
@ -3068,11 +3162,11 @@ The backtracking match limit was reached.
|
|||
.sp
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
.sp
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
.sp
|
||||
PCRE2_ERROR_NULL
|
||||
.sp
|
||||
|
@ -3321,12 +3415,23 @@ same number causes an error at compile time.
|
|||
.B " PCRE2_SIZE *\fIoutlengthptr\fP);"
|
||||
.fi
|
||||
.P
|
||||
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
||||
string in \fIoutputbuffer\fP, replacing one or more parts that were matched
|
||||
with the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP.
|
||||
This can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
The default is to perform just one replacement, but there is an option that
|
||||
requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below for details).
|
||||
This function optionally calls \fBpcre2_match()\fP and then makes a copy of the
|
||||
subject string in \fIoutputbuffer\fP, replacing parts that were matched with
|
||||
the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP, which
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
||||
special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the
|
||||
replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an
|
||||
error occurs if \fIreplacement\fP is NULL.
|
||||
.P
|
||||
There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
||||
the replacement string(s). The default action is to perform just one
|
||||
replacement if the pattern matches, but there is an option that requests
|
||||
multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
||||
.P
|
||||
If successful, \fBpcre2_substitute()\fP returns the number of substitutions
|
||||
that were carried out. This may be zero if no match was found, and is never
|
||||
greater than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A negative value is
|
||||
returned if an error is detected.
|
||||
.P
|
||||
Matches in which a \eK item in a lookahead in the pattern causes the match to
|
||||
end before it starts are not supported, and give rise to an error return. For
|
||||
|
@ -3341,32 +3446,79 @@ data block is obtained and freed within this function, using memory management
|
|||
functions from the match context, if provided, or else those that were used to
|
||||
allocate memory for the compiled code.
|
||||
.P
|
||||
If an external \fImatch_data\fP block is provided, its contents afterwards
|
||||
are those set by the final call to \fBpcre2_match()\fP. For global changes,
|
||||
this will have ended in a matching error. The contents of the ovector within
|
||||
the match data block may or may not have been changed.
|
||||
If \fImatch_data\fP is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the
|
||||
provided block is used for all calls to \fBpcre2_match()\fP, and its contents
|
||||
afterwards are the result of the final call. For global changes, this will
|
||||
always be a no-match error. The contents of the ovector within the match data
|
||||
block may or may not have been changed.
|
||||
.P
|
||||
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful, the
|
||||
value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added.
|
||||
As well as the usual options for \fBpcre2_match()\fP, a number of additional
|
||||
options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
|
||||
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
||||
\fImatch_data\fP block must be provided, and it must have already been used for
|
||||
an external call to \fBpcre2_match()\fP with the same pattern and subject
|
||||
arguments. The data in the \fImatch_data\fP block (return code, offset vector)
|
||||
is then used for the first substitution instead of calling \fBpcre2_match()\fP
|
||||
from within \fBpcre2_substitute()\fP. This allows an application to check for a
|
||||
match before choosing to substitute, without having to repeat the match.
|
||||
.P
|
||||
The contents of the externally supplied match data block are not changed when
|
||||
PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set,
|
||||
\fBpcre2_match()\fP is called after the first substitution to check for further
|
||||
matches, but this is done using an internally obtained match data block, thus
|
||||
always leaving the external block unchanged.
|
||||
.P
|
||||
The \fIcode\fP argument is not used for matching before the first substitution
|
||||
when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, even when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains information such as the
|
||||
UTF setting and the number of capturing parentheses in the pattern.
|
||||
.P
|
||||
The default action of \fBpcre2_substitute()\fP is to return a copy of the
|
||||
subject string with matched substrings replaced. However, if
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are
|
||||
returned. In the global case, multiple replacements are concatenated in the
|
||||
output buffer. Substitution callouts (see
|
||||
.\" HTML <a href="#subcallouts">
|
||||
.\" </a>
|
||||
below)
|
||||
.\"
|
||||
can be used to separate them if necessary.
|
||||
.P
|
||||
The \fIoutlengthptr\fP argument of \fBpcre2_substitute()\fP must point to a
|
||||
variable that contains the length, in code units, of the output buffer. If the
|
||||
function is successful, the value is updated to contain the length in code
|
||||
units of the new string, excluding the trailing zero that is automatically
|
||||
added.
|
||||
.P
|
||||
If the function is not successful, the value set via \fIoutlengthptr\fP depends
|
||||
on the type of error. For syntax errors in the replacement string, the value is
|
||||
the offset in the replacement string where the error was detected. For other
|
||||
errors, the value is PCRE2_UNSET by default. This includes the case of the
|
||||
output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set
|
||||
(see below), in which case the value is the minimum length needed, including
|
||||
space for the trailing zero. Note that in order to compute the required length,
|
||||
\fBpcre2_substitute()\fP has to simulate all the matching and copying, instead
|
||||
of giving an error return as soon as the buffer overflows. Note also that the
|
||||
length is in code units, not bytes.
|
||||
output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set.
|
||||
.P
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capture groups or names from (*MARK) or other control verbs
|
||||
in the pattern. The following forms are always recognized:
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||
this option is set, however, \fBpcre2_substitute()\fP continues to go through
|
||||
the motions of matching and substituting (without, of course, writing anything)
|
||||
in order to compute the size of buffer that is needed. This value is passed
|
||||
back via the \fIoutlengthptr\fP variable, with the result of the function still
|
||||
being PCRE2_ERROR_NOMEMORY.
|
||||
.P
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for a given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||
.P
|
||||
The replacement string, which is interpreted as a UTF string in UTF mode, is
|
||||
checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF
|
||||
replacement string causes an immediate return with the relevant UTF error code.
|
||||
.P
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted
|
||||
in any way. By default, however, a dollar character is an escape character that
|
||||
can specify the insertion of characters from capture groups and names from
|
||||
(*MARK) or other control verbs in the pattern. The following forms are always
|
||||
recognized:
|
||||
.sp
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
|
@ -3389,9 +3541,6 @@ facility can be used to perform simple simultaneous substitutions, as this
|
|||
apple lemon
|
||||
2: pear orange
|
||||
.sp
|
||||
As well as the usual options for \fBpcre2_match()\fP, a number of additional
|
||||
options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP.
|
||||
.P
|
||||
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string,
|
||||
replacing every matching substring. If this option is not set, only the first
|
||||
matching substring is replaced. The search for matches takes place in the
|
||||
|
@ -3402,7 +3551,7 @@ set in the match context, searching stops when that limit is reached.
|
|||
.P
|
||||
You can restrict the effect of a global substitution to a portion of the
|
||||
subject string by setting either or both of \fIstartoffset\fP and an offset
|
||||
limit. Here is a \fPpcre2test\fP example:
|
||||
limit. Here is a \fBpcre2test\fP example:
|
||||
.sp
|
||||
/B/g,replace=!,use_offset_limit
|
||||
ABC ABC ABC ABC\e=offset=3,offset_limit=12
|
||||
|
@ -3414,20 +3563,6 @@ If this is not successful, the offset is advanced by one character except when
|
|||
CRLF is a valid newline sequence and the next two characters are CR, LF. In
|
||||
this case, the offset is advanced by two characters.
|
||||
.P
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||
this option is set, however, \fBpcre2_substitute()\fP continues to go through
|
||||
the motions of matching and substituting (without, of course, writing anything)
|
||||
in order to compute the size of buffer that is needed. This value is passed
|
||||
back via the \fIoutlengthptr\fP variable, with the result of the function still
|
||||
being PCRE2_ERROR_NOMEMORY.
|
||||
.P
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||
.P
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do
|
||||
not appear in the pattern to be treated as unset groups. This option should be
|
||||
used with care, because it means that a typo in a group name or number no
|
||||
|
@ -3457,8 +3592,11 @@ and force lower case. The escape sequences change the current state: \eU and
|
|||
terminating a \eQ quoted sequence) reverts to no case forcing. The sequences
|
||||
\eu and \el force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \eQ...\eE quoted sequences.
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
capture groups and letters within \eQ...\eE quoted sequences. If either
|
||||
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
||||
properties are used for case forcing characters whose code points are greater
|
||||
than 127.
|
||||
.P
|
||||
Note that case forcing sequences such as \eU...\eE do not nest. For example,
|
||||
the result of processing "\eUaa\eLBB\eEcc\eE" is "AAbbcc"; the final \eE has no
|
||||
|
@ -3494,13 +3632,17 @@ The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
|||
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
||||
groups in the extended syntax forms to be treated as unset.
|
||||
.P
|
||||
If successful, \fBpcre2_substitute()\fP returns the number of successful
|
||||
matches. This may be zero if no matches were found, and is never greater than 1
|
||||
unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
.P
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP
|
||||
are passed straight back.
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET,
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrelevant and
|
||||
are ignored.
|
||||
.
|
||||
.
|
||||
.SS "Substitution errors"
|
||||
.rs
|
||||
.sp
|
||||
In the event of an error, \fBpcre2_substitute()\fP returns a negative error
|
||||
code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from
|
||||
\fBpcre2_match()\fP are passed straight back.
|
||||
.P
|
||||
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
||||
unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
|
||||
|
@ -3514,6 +3656,11 @@ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
|
|||
needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
||||
default.
|
||||
.P
|
||||
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
||||
\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP
|
||||
arguments are NULL. For backward compatibility reasons an exception is made for
|
||||
the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0.
|
||||
.P
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||
(invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket
|
||||
|
@ -3531,6 +3678,7 @@ above).
|
|||
.\"
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="subcallouts"></a>
|
||||
.SS "Substitution callouts"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -3673,12 +3821,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|||
.P
|
||||
The function \fBpcre2_dfa_match()\fP is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the subject
|
||||
string just once (not counting lookaround assertions), and does not backtrack.
|
||||
This has different characteristics to the normal algorithm, and is not
|
||||
compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
||||
Nevertheless, there are times when this kind of matching can be useful. For a
|
||||
discussion of the two matching algorithms, and a list of features that
|
||||
\fBpcre2_dfa_match()\fP does not support, see the
|
||||
string just once (not counting lookaround assertions), and does not backtrack
|
||||
(except when processing lookaround assertions). This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does
|
||||
not support, see the
|
||||
.\" HREF
|
||||
\fBpcre2matching\fP
|
||||
.\"
|
||||
|
@ -3710,7 +3859,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
|
|||
wspace, /* working space vector */
|
||||
20); /* number of elements (NOT size in bytes) */
|
||||
.
|
||||
.SS "Option bits for \fBpcre_dfa_match()\fP"
|
||||
.SS "Option bits for \fBpcre2_dfa_match()\fP"
|
||||
.rs
|
||||
.sp
|
||||
The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
|
||||
|
@ -3869,7 +4018,7 @@ fail, this error is given.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -3878,6 +4027,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 02 September 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "03 March 2019" "PCRE2 10.33"
|
||||
.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -110,7 +110,7 @@ To build it without Unicode support, add
|
|||
--disable-unicode
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This setting applies to all three libraries. It
|
||||
is not possible to build one library with Unicode support, and another without,
|
||||
is not possible to build one library with Unicode support and another without
|
||||
in the same configuration.
|
||||
.P
|
||||
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
|
||||
|
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \eP, \ep,
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||
supported. Details are given in the
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -175,11 +176,11 @@ SELinux. This has no effect if JIT is not enabled. See the
|
|||
\fBpcre2jit\fP
|
||||
.\"
|
||||
documentation for a discussion of JIT usage. When JIT support is enabled,
|
||||
pcre2grep automatically makes use of it, unless you add
|
||||
\fBpcre2grep\fP automatically makes use of it, unless you add
|
||||
.sp
|
||||
--disable-pcre2grep-jit
|
||||
.sp
|
||||
to the "configure" command.
|
||||
to the \fBconfigure\fP command.
|
||||
.
|
||||
.
|
||||
.SH "NEWLINE RECOGNITION"
|
||||
|
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
|
|||
\fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The \fBpcre2_match()\fP function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
|
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
.sp
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -317,6 +317,7 @@ used for lookaround assertions, atomic groups, and recursion within patterns.
|
|||
The limit does not apply to JIT matching.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="createtables"></a>
|
||||
.SH "CREATING CHARACTER TABLES AT BUILD TIME"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -328,12 +329,33 @@ only. If you add
|
|||
--enable-rebuild-chartables
|
||||
.sp
|
||||
to the \fBconfigure\fP command, the distributed tables are no longer used.
|
||||
Instead, a program called \fBdftables\fP is compiled and run. This outputs the
|
||||
source for new set of tables, created in the default locale of your C run-time
|
||||
system. This method of replacing the tables does not work if you are cross
|
||||
compiling, because \fBdftables\fP is run on the local host. If you need to
|
||||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".
|
||||
Instead, a program called \fBpcre2_dftables\fP is compiled and run. This
|
||||
outputs the source for a new set of tables, created in the default locale of your
|
||||
C run-time system. This method of replacing the tables does not work if you are
|
||||
cross compiling, because \fBpcre2_dftables\fP needs to be run on the local
|
||||
host and therefore not compiled with the cross compiler.
|
||||
.P
|
||||
If you need to create alternative tables when cross compiling, you will have to
|
||||
do so "by hand". There may also be other reasons for creating tables manually.
|
||||
To cause \fBpcre2_dftables\fP to be built on the local host, run a normal
|
||||
compiling command, and then run the program with the output file as its
|
||||
argument, for example:
|
||||
.sp
|
||||
cc src/pcre2_dftables.c -o pcre2_dftables
|
||||
./pcre2_dftables src/pcre2_chartables.c
|
||||
.sp
|
||||
This builds the tables in the default locale of the local host. If you want to
|
||||
specify a locale, you must use the -L option:
|
||||
.sp
|
||||
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
||||
.sp
|
||||
You can also specify -b (with or without -L). This causes the tables to be
|
||||
written in binary instead of as source code. A set of binary tables can be
|
||||
loaded into memory by an application and passed to \fBpcre2_compile()\fP in the
|
||||
same way as tables created by calling \fBpcre2_maketables()\fP. The tables are
|
||||
just a string of bytes, independent of hardware characteristics such as
|
||||
endianness. This means they can be bundled with an application that runs in
|
||||
different environments, to ensure consistent behaviour.
|
||||
.
|
||||
.
|
||||
.SH "USING EBCDIC CODE"
|
||||
|
@ -417,7 +439,7 @@ default parameter values by adding, for example,
|
|||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
.sp
|
||||
to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override
|
||||
to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
|
||||
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||
.
|
||||
.
|
||||
|
@ -541,15 +563,16 @@ documentation.
|
|||
.sp
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
.sp
|
||||
--disable-percent-zt
|
||||
.sp
|
||||
is specified, no use is made of the z or t modifiers. Instead or %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
a suitable format is used depending on the size of long for the platform.
|
||||
.
|
||||
.
|
||||
.SH "SUPPORT FOR FUZZERS"
|
||||
|
@ -601,7 +624,7 @@ give a warning.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -610,6 +633,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 March 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,33 +1,43 @@
|
|||
.TH PCRE2COMPAT 3 "13 July 2019" "PCRE2 10.34"
|
||||
.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
|
||||
.rs
|
||||
.sp
|
||||
This document describes the differences in the ways that PCRE2 and Perl handle
|
||||
regular expressions. The differences described here are with respect to Perl
|
||||
versions 5.26, but as both Perl and PCRE2 are continually changing, the
|
||||
information may sometimes be out of date.
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
.P
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
.P
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \eb* (but not \eb{3}), but these do not seem to have any use.
|
||||
for example, \eb* , but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
.P
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
.P
|
||||
4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
\eU, and \eN when followed by a character name. \eN on its own, matching a
|
||||
non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -37,23 +47,27 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
|
|||
PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
|
||||
interprets them.
|
||||
.P
|
||||
5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \ep and \eP are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
PCRE2 does support the Cs (surrogate) property, which Perl does not; the Perl
|
||||
documentation says "Because Perl hides the need for the user to understand the
|
||||
internal representation of Unicode characters, there is no need to implement
|
||||
the somewhat messy concept of surrogates."
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
.P
|
||||
6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \eQ and \eE which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \eQ
|
||||
and \eE which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \eQ and \eE just like any other character. Note the
|
||||
following examples:
|
||||
.sp
|
||||
Pattern PCRE2 matches Perl matches
|
||||
.sp
|
||||
|
@ -65,9 +79,10 @@ other character. Note the following examples:
|
|||
\eQA\eB\eE A\eB A\eB
|
||||
\eQ\e\eE \e \e\eE
|
||||
.sp
|
||||
The \eQ...\eE sequence is recognized both inside and outside character classes.
|
||||
The \eQ...\eE sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
.P
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
.\" HREF
|
||||
|
@ -75,27 +90,24 @@ external function to be called during pattern matching. See the
|
|||
.\"
|
||||
documentation for details.
|
||||
.P
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
.P
|
||||
9. If any of the backtracking control verbs are used in a group that is called
|
||||
as a subroutine (whether or not recursively), their effect is confined to that
|
||||
group; it does not extend to the surrounding pattern. This is not always the
|
||||
case in Perl. In particular, if (*THEN) is present in a group that is called as
|
||||
a subroutine, its action is limited to that group, even if the group does not
|
||||
contain any | characters. Note that such groups are processed as anchored
|
||||
at the point where they are tested.
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
that is called as a subroutine, its action is limited to that group, even if
|
||||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
.P
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
.P
|
||||
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||
not confined to the assertion.
|
||||
.P
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
|
@ -104,7 +116,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
|||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact that PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B), where the two
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
capture groups have the same number but different names, is not supported, and
|
||||
causes an error at compile time. If it were allowed, it would not be possible
|
||||
to distinguish which group matched, because both names map to capture group
|
||||
|
@ -124,17 +136,24 @@ certainly user mistakes.
|
|||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \ep{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.24), \ep{Lu} and \ep{Ll} match all
|
||||
in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
.P
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 includes new features that are not in earlier versions of Perl, some
|
||||
17. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\eK is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
.P
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.26:
|
||||
list is with respect to Perl 5.34:
|
||||
.sp
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative branch of a lookbehind assertion can match a different length
|
||||
of string. Perl requires them all to have the same length.
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
.sp
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
in lookbehinds, provided that there is no possibility of referencing a
|
||||
|
@ -168,18 +187,18 @@ variable interpolation, but not general hooks on every match.
|
|||
different way and is not Perl-compatible.
|
||||
.sp
|
||||
(l) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
|
||||
the start of a pattern that set overall options that cannot be changed within
|
||||
the start of a pattern. These set overall options that cannot be changed within
|
||||
the pattern.
|
||||
.sp
|
||||
(m) PCRE2 supports non-atomic positive lookaround assertions. This is an
|
||||
extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
.P
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts \ed numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
.P
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
.\" HREF
|
||||
\fBpcre2limit\fP
|
||||
.\"
|
||||
|
@ -194,7 +213,7 @@ fall into any stack-overflow limit. PCRE2 made a similar change at release
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -203,6 +222,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 July 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -116,8 +116,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
(which does match separators) is supported.
|
||||
.P
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
.
|
||||
.
|
||||
.SH "CONVERTING POSIX PATTERNS"
|
||||
|
|
|
@ -215,8 +215,8 @@ if (rc < 0)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded. Get a pointer to the output vector, where string offsets are
|
||||
stored. */
|
||||
/* Match succeeded. Get a pointer to the output vector, where string offsets
|
||||
are stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||
|
@ -234,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
|
|||
if (rc == 0)
|
||||
printf("ovector was not big enough for all the captured substrings\en");
|
||||
|
||||
/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
|
||||
to set the start of a match later than its end. In this demonstration program,
|
||||
we just detect this case and give up. */
|
||||
/* Since release 10.38 PCRE2 has locked out the use of \eK in lookaround
|
||||
assertions. However, there is an option to re-enable the old behaviour. If that
|
||||
is set, it is possible to run patterns such as /(?=.\eK)/ that use \eK in an
|
||||
assertion to set the start of a match later than its end. In this demonstration
|
||||
program, we show how to detect this case, but it shouldn't arise because the
|
||||
option is never set. */
|
||||
|
||||
if (ovector[0] > ovector[1])
|
||||
{
|
||||
|
@ -453,7 +456,7 @@ for (;;)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
/* Match succeeded */
|
||||
|
||||
printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);
|
||||
|
||||
|
|
382
doc/pcre2grep.1
382
doc/pcre2grep.1
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "15 June 2019" "PCRE2 10.34"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -79,8 +81,8 @@ matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
|
|||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be found. If
|
||||
there are multiple patterns, they are all tried on the remainder of the line,
|
||||
but patterns that follow the one that matched are not tried on the earlier part
|
||||
of the line.
|
||||
but patterns that follow the one that matched are not tried on the earlier
|
||||
matched part of the line.
|
||||
.P
|
||||
This behaviour means that the order in which multiple patterns are specified
|
||||
can affect the output when one of the above options is used. This is no longer
|
||||
|
@ -115,11 +117,10 @@ ignored.
|
|||
.rs
|
||||
.sp
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. (GNU grep
|
||||
identifies binary files in this manner.) However, if the newline type is
|
||||
specified as "nul", that is, the line terminator is a binary zero, the test for
|
||||
a binary file is not applied. See the \fB--binary-files\fP option for a means
|
||||
of changing the way binary files are handled.
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the \fB--binary-files\fP
|
||||
option for a means of changing the way binary files are handled.
|
||||
.
|
||||
.
|
||||
.SH "BINARY ZEROS IN PATTERNS"
|
||||
|
@ -150,22 +151,30 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
\fB--binary-files\fP=\fItext\fP.
|
||||
.TP
|
||||
\fB--allow-lookaround-bsk\fP
|
||||
PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl.
|
||||
This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
.TP
|
||||
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
|
||||
Output up to \fInumber\fP lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -352,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -383,8 +394,8 @@ Ignore upper/lower case distinctions during comparisons.
|
|||
.TP
|
||||
\fB--include\fP=\fIpattern\fP
|
||||
If any \fB--include\fP patterns are specified, the only files that are
|
||||
processed are those that match one of the patterns (and do not match an
|
||||
\fB--exclude\fP pattern). This option does not affect directories, but it
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
\fB--exclude\fP pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
|
@ -401,8 +412,8 @@ may be given any number of times; all the files are read.
|
|||
.TP
|
||||
\fB--include-dir\fP=\fIpattern\fP
|
||||
If any \fB--include-dir\fP patterns are specified, the only directories that
|
||||
are processed are those that match one of the patterns (and do not match an
|
||||
\fB--exclude-dir\fP pattern). This applies to all directories, whether listed
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The \fB-F\fP,
|
||||
|
@ -413,18 +424,21 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches. This
|
||||
opeion overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options.
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB--label\fP=\fIname\fP
|
||||
This option supplies a name to be used for the standard input when file names
|
||||
|
@ -435,8 +449,8 @@ short form for this option.
|
|||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
|
||||
terminal (which is currently possible only in Unix-like environments or
|
||||
Windows). Output to terminal is normally automatically flushed by the operating
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to a terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the \fB-M\fP (multiline) option
|
||||
|
@ -459,40 +473,6 @@ the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
|||
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||
used. There is no short form for this option.
|
||||
.TP
|
||||
\fB--match-limit\fP=\fInumber\fP
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
.sp
|
||||
The \fB--match-limit\fP option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than \fB--match-limit\fP.
|
||||
.sp
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
.TP
|
||||
\fB--max-buffer-size=\fInumber\fP
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-M\fP, \fB--multiline\fP
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
|
@ -520,27 +500,74 @@ well as possibly handling a two-character newline sequence.
|
|||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
||||
does not work when input is read line by line (see \fP--line-buffered\fP.)
|
||||
does not work when input is read line by line (see \fB--line-buffered\fP.)
|
||||
.TP
|
||||
\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
|
||||
Stop processing after finding \fInumber\fP matching lines, or non-matching
|
||||
lines if \fB-v\fP is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If \fB-c\fP is also set, the count that is output is never greater than
|
||||
\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
|
||||
\fB-q\fP, or when just checking for a match in a binary file.
|
||||
.TP
|
||||
\fB--match-limit\fP=\fInumber\fP
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
.sp
|
||||
The \fB--match-limit\fP option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than \fB--match-limit\fP.
|
||||
.sp
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
.TP
|
||||
\fB--max-buffer-size\fP=\fInumber\fP
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
||||
The PCRE2 library supports five different conventions for indicating
|
||||
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||
which recognizes any of the preceding three types, and an "any" convention, in
|
||||
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||
(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||
PS (paragraph separator, U+2029).
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
.sp
|
||||
pcre2grep -N CRLF 'some pattern' <file>
|
||||
.sp
|
||||
The newline type may be specified in upper, lower, or mixed case. If the
|
||||
newline type is NUL, lines are separated by binary zero characters. The other
|
||||
types are the single-character sequences CR (carriage return) and LF
|
||||
(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
|
||||
recognizes any of the preceding three types, and an "any" type, for which any
|
||||
Unicode line ending sequence is assumed to end a line. The Unicode sequences
|
||||
are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
.sp
|
||||
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, \fBpcre2grep\fP uses the library's default.
|
||||
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||
makes it possible to use \fBpcre2grep\fP to scan files that have come from
|
||||
other environments without having to modify their line endings. If the data
|
||||
that is being scanned does not agree with the convention set by this option,
|
||||
\fBpcre2grep\fP may behave in strange ways. Note that this option does not
|
||||
apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
|
||||
.sp
|
||||
This option makes it possible to use \fBpcre2grep\fP to scan files that have
|
||||
come from other environments without having to modify their line endings. If
|
||||
the data that is being scanned does not agree with the convention set by this
|
||||
option, \fBpcre2grep\fP may behave in strange ways. Note that this option does
|
||||
not apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
|
||||
\fB--include-from\fP options, which are expected to use the operating system's
|
||||
standard newline sequence.
|
||||
.TP
|
||||
|
@ -559,25 +586,36 @@ use of JIT at run time. It is provided for testing and working round problems.
|
|||
It should never be needed in normal use.
|
||||
.TP
|
||||
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
|
||||
When there is a match, instead of outputting the whole line that matched,
|
||||
output just the given text. This option is mutually exclusive with
|
||||
\fB--only-matching\fP, \fB--file-offsets\fP, and \fB--line-offsets\fP. Escape
|
||||
sequences starting with a dollar character may be used to insert the contents
|
||||
of the matched part of the line and/or captured substrings into the text.
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
|
||||
and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
|
||||
this option, which is mutually exclusive with \fB--only-matching\fP,
|
||||
\fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
|
||||
\fB--only-matching\fP, if there is more than one match in a line, each of them
|
||||
causes a line of output.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured
|
||||
substring of the given decimal number; zero substitutes the whole match. If
|
||||
the number is greater than the number of capturing substrings, or if the
|
||||
capture is unset, the replacement is empty.
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
.sp
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
.sp
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
.sp
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
.sp
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar.
|
||||
|
@ -636,7 +674,8 @@ immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
|
|||
option to "recurse".
|
||||
.TP
|
||||
\fB--recursion-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP above.
|
||||
This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
|
||||
above for details.
|
||||
.TP
|
||||
\fB-s\fP, \fB--no-messages\fP
|
||||
Suppress error messages about non-existent or unreadable files. Such files are
|
||||
|
@ -657,14 +696,17 @@ total would always be zero.
|
|||
\fB-u\fP, \fB--utf\fP
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||
\fB--include\fP options) and all subject lines that are scanned must be valid
|
||||
strings of UTF-8 characters.
|
||||
\fB--include\fP options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
.TP
|
||||
\fB-U\fP, \fB--utf-allow-invalid\fP
|
||||
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. This facility
|
||||
allows valid UTF-8 strings to be sought in executable or other binary files.
|
||||
For more details about matching in non-valid UTF-8 strings, see the
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP(3)
|
||||
.\"
|
||||
|
@ -677,7 +719,9 @@ ignored.
|
|||
.TP
|
||||
\fB-v\fP, \fB--invert-match\fP
|
||||
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
||||
the patterns are the ones that are found.
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
.TP
|
||||
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
||||
Force the patterns only to match "words". That is, there must be a word
|
||||
|
@ -694,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -709,16 +759,25 @@ by the \fB--locale\fP option. If no locale is set, the PCRE2 library's default
|
|||
.rs
|
||||
.sp
|
||||
The \fB-N\fP (\fB--newline\fP) option allows \fBpcre2grep\fP to scan files with
|
||||
different newline conventions from the default. Any parts of the input files
|
||||
that are written to the standard output are copied identically, with whatever
|
||||
newline sequences they have in the input. However, the setting of this option
|
||||
affects only the way scanned files are processed. It does not affect the
|
||||
interpretation of files specified by the \fB-f\fP, \fB--file-list\fP,
|
||||
\fB--exclude-from\fP, or \fB--include-from\fP options, nor does it affect the
|
||||
way in which \fBpcre2grep\fP writes informational messages to the standard
|
||||
error and output streams. For these it uses the string "\en" to indicate
|
||||
newlines, relying on the C I/O library to convert this to an appropriate
|
||||
sequence.
|
||||
newline conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation of files
|
||||
specified by the \fB-f\fP, \fB--file-list\fP, \fB--exclude-from\fP, or
|
||||
\fB--include-from\fP options.
|
||||
.P
|
||||
Any parts of the scanned input files that are written to the standard output
|
||||
are copied with whatever newline sequences they have in the input. However, if
|
||||
the final line of a file is output, and it does not end with a newline
|
||||
sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
|
||||
or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
|
||||
single NL is used.
|
||||
.P
|
||||
The newline setting does not affect the way in which \fBpcre2grep\fP writes
|
||||
newlines in informational messages to the standard output and error streams.
|
||||
Under Windows, the standard output is set to be binary, so that "\er\en" at the
|
||||
ends of output lines that are copied from the input is not converted to
|
||||
"\er\er\en" by the C I/O library. This means that any messages written to the
|
||||
standard output must end with "\er\en". For all other operating systems, and
|
||||
for all messages to the standard error stream, "\en" is used.
|
||||
.
|
||||
.
|
||||
.SH "OPTIONS COMPATIBILITY"
|
||||
|
@ -795,12 +854,36 @@ documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
|
|||
only callouts with string arguments are useful.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
\fBpcre2grep\fP was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) cause the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
.sp
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
.sp
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
.
|
||||
.
|
||||
.SS "Calling external programs or scripts"
|
||||
.rs
|
||||
.sp
|
||||
This facility can be independently disabled when \fBpcre2grep\fP is built. It
|
||||
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
|
||||
where \fBlib$spawn()\fP is used, and for any other Unix-like environment where
|
||||
where \fBlib$spawn()\fP is used, and for any Unix-like environment where
|
||||
\fBfork()\fP and \fBexecv()\fP are available.
|
||||
.P
|
||||
If the callout string does not start with a pipe (vertical bar) character, it
|
||||
|
@ -811,13 +894,11 @@ arguments:
|
|||
executable_name|arg1|arg2|...
|
||||
.sp
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
||||
captured substring of the given decimal number, which must be greater than
|
||||
zero. If the number is greater than the number of capturing substrings, or if
|
||||
the capture is unset, the replacement is empty.
|
||||
.P
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||
started by a dollar character. These are the same as for the \fB--output\fP
|
||||
(\fB-O\fP) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
.sp
|
||||
echo -e "abcde\en12345" | pcre2grep \e
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -830,28 +911,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
.sp
|
||||
The parameters for the system call that is used to run the
|
||||
program or script are zero-terminated strings. This means that binary zero
|
||||
characters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. If running the program fails for any reason (including
|
||||
the non-existence of the executable), a local matching failure occurs and the
|
||||
matcher backtracks in the normal way.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
This facility is always available, provided that callouts were not completely
|
||||
disabled when \fBpcre2grep\fP was built. If the callout string starts with a
|
||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
||||
having been passed through the same escape processing as text from the --output
|
||||
option. This provides a simple echoing facility that avoids calling an external
|
||||
program or script. No terminator is added to the string, so if you want a
|
||||
newline, you must include it explicitly. Matching continues normally after the
|
||||
string is output. If you want to see only the callout output but not any output
|
||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) cause the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
.
|
||||
.
|
||||
.SH "MATCHING ERRORS"
|
||||
|
@ -887,7 +954,8 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3).
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
|
||||
\fBpcre2unicode\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -895,7 +963,7 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -904,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 15 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -42,13 +42,15 @@ DESCRIPTION
|
|||
|
||||
pcre2grep some-pattern file1 - file3
|
||||
|
||||
Input files are searched line by line. By default, each line that
|
||||
By default, input files are searched line by line. Each line that
|
||||
matches a pattern is copied to the standard output, and if there is
|
||||
more than one file, the file name is output at the start of each line,
|
||||
followed by a colon. However, there are options that can change how
|
||||
pcre2grep behaves. In particular, the -M option makes it possible to
|
||||
pcre2grep behaves. For example, the -M option makes it possible to
|
||||
search for strings that span line boundaries. What defines a line
|
||||
boundary is controlled by the -N (--newline) option.
|
||||
boundary is controlled by the -N (--newline) option. The -h and -H op-
|
||||
tions control whether or not file names are shown, and the -Z option
|
||||
changes the file name terminator to a zero byte.
|
||||
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the --buffer-size and
|
||||
|
@ -80,7 +82,7 @@ DESCRIPTION
|
|||
following the match, so that further matches on the same line can be
|
||||
found. If there are multiple patterns, they are all tried on the re-
|
||||
mainder of the line, but patterns that follow the one that matched are
|
||||
not tried on the earlier part of the line.
|
||||
not tried on the earlier matched part of the line.
|
||||
|
||||
This behaviour means that the order in which multiple patterns are
|
||||
specified can affect the output when one of the above options is used.
|
||||
|
@ -115,10 +117,10 @@ BINARY FILES
|
|||
|
||||
By default, a file that contains a binary zero byte within the first
|
||||
1024 bytes is identified as a binary file, and is processed specially.
|
||||
(GNU grep identifies binary files in this manner.) However, if the new-
|
||||
line type is specified as "nul", that is, the line terminator is a bi-
|
||||
nary zero, the test for a binary file is not applied. See the --binary-
|
||||
files option for a means of changing the way binary files are handled.
|
||||
However, if the newline type is specified as NUL, that is, the line
|
||||
terminator is a binary zero, the test for a binary file is not applied.
|
||||
See the --binary-files option for a means of changing the way binary
|
||||
files are handled.
|
||||
|
||||
|
||||
BINARY ZEROS IN PATTERNS
|
||||
|
@ -149,26 +151,35 @@ OPTIONS
|
|||
the file is reached, or if the processing buffer size has
|
||||
been set too small. If file names and/or line numbers are be-
|
||||
ing output, a hyphen separator is used instead of a colon for
|
||||
the context lines. A line containing "--" is output between
|
||||
each group of lines, unless they are in fact contiguous in
|
||||
the input file. The value of number is expected to be rela-
|
||||
tively small. When -c is used, -A is ignored.
|
||||
the context lines (the -Z option can be used to change the
|
||||
file name terminator to a zero byte). A line containing "--"
|
||||
is output between each group of lines, unless they are in
|
||||
fact contiguous in the input file. The value of number is ex-
|
||||
pected to be relatively small. When -c is used, -A is ig-
|
||||
nored.
|
||||
|
||||
-a, --text
|
||||
Treat binary files as text. This is equivalent to --binary-
|
||||
files=text.
|
||||
|
||||
--allow-lookaround-bsk
|
||||
PCRE2 now forbids the use of \K in lookarounds by default, in
|
||||
line with Perl. This option causes pcre2grep to set the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which enables this
|
||||
somewhat dangerous usage.
|
||||
|
||||
-B number, --before-context=number
|
||||
Output up to number lines of context before each matching
|
||||
line. Fewer lines are output if the previous match or the
|
||||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
Output up to number lines of context before each matching
|
||||
line. Fewer lines are output if the previous match or the
|
||||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
line numbers are being output, a hyphen separator is used in-
|
||||
stead of a colon for the context lines. A line containing
|
||||
"--" is output between each group of lines, unless they are
|
||||
in fact contiguous in the input file. The value of number is
|
||||
expected to be relatively small. When -c is used, -B is ig-
|
||||
nored.
|
||||
stead of a colon for the context lines (the -Z option can be
|
||||
used to change the file name terminator to a zero byte). A
|
||||
line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The
|
||||
value of number is expected to be relatively small. When -c
|
||||
is used, -B is ignored.
|
||||
|
||||
--binary-files=word
|
||||
Specify how binary files are to be processed. If the word is
|
||||
|
@ -381,88 +392,94 @@ OPTIONS
|
|||
|
||||
-H, --with-filename
|
||||
Force the inclusion of the file name at the start of output
|
||||
lines when searching a single file. By default, the file name
|
||||
is not shown in this case. For matching lines, the file name
|
||||
is followed by a colon; for context lines, a hyphen separator
|
||||
is used. If a line number is also being output, it follows
|
||||
the file name. When the -M option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file
|
||||
name. This option overrides any previous -h, -l, or -L op-
|
||||
tions.
|
||||
lines when searching a single file. The file name is not nor-
|
||||
mally shown in this case. By default, for matching lines,
|
||||
the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. The -Z option can be used to change
|
||||
the terminator to a zero byte. If a line number is also being
|
||||
output, it follows the file name. When the -M option causes a
|
||||
pattern to match more than one line, only the first is pre-
|
||||
ceded by the file name. This option overrides any previous
|
||||
-h, -l, or -L options.
|
||||
|
||||
-h, --no-filename
|
||||
Suppress the output file names when searching multiple files.
|
||||
By default, file names are shown when multiple files are
|
||||
searched. For matching lines, the file name is followed by a
|
||||
colon; for context lines, a hyphen separator is used. If a
|
||||
line number is also being output, it follows the file name.
|
||||
This option overrides any previous -H, -L, or -l options.
|
||||
File names are normally shown when multiple files are
|
||||
searched. By default, for matching lines, the file name is
|
||||
followed by a colon; for context lines, a hyphen separator is
|
||||
used. The -Z option can be used to change the terminator to a
|
||||
zero byte. If a line number is also being output, it follows
|
||||
the file name. This option overrides any previous -H, -L, or
|
||||
-l options.
|
||||
|
||||
--heap-limit=number
|
||||
See --match-limit below.
|
||||
|
||||
--help Output a help message, giving brief details of the command
|
||||
options and file type support, and then exit. Anything else
|
||||
--help Output a help message, giving brief details of the command
|
||||
options and file type support, and then exit. Anything else
|
||||
on the command line is ignored.
|
||||
|
||||
-I Ignore binary files. This is equivalent to --binary-
|
||||
-I Ignore binary files. This is equivalent to --binary-
|
||||
files=without-match.
|
||||
|
||||
-i, --ignore-case
|
||||
Ignore upper/lower case distinctions during comparisons.
|
||||
|
||||
--include=pattern
|
||||
If any --include patterns are specified, the only files that
|
||||
are processed are those that match one of the patterns (and
|
||||
do not match an --exclude pattern). This option does not af-
|
||||
fect directories, but it applies to all files, whether listed
|
||||
on the command line, obtained from --file-list, or by scan-
|
||||
ning a directory. The pattern is a PCRE2 regular expression,
|
||||
and is matched against the final component of the file name,
|
||||
not the entire path. The -F, -w, and -x options do not apply
|
||||
to this pattern. The option may be given any number of times.
|
||||
If a file name matches both an --include and an --exclude
|
||||
pattern, it is excluded. There is no short form for this op-
|
||||
tion.
|
||||
If any --include patterns are specified, the only files that
|
||||
are processed are those whose names match one of the patterns
|
||||
and do not match an --exclude pattern. This option does not
|
||||
affect directories, but it applies to all files, whether
|
||||
listed on the command line, obtained from --file-list, or by
|
||||
scanning a directory. The pattern is a PCRE2 regular expres-
|
||||
sion, and is matched against the final component of the file
|
||||
name, not the entire path. The -F, -w, and -x options do not
|
||||
apply to this pattern. The option may be given any number of
|
||||
times. If a file name matches both an --include and an --ex-
|
||||
clude pattern, it is excluded. There is no short form for
|
||||
this option.
|
||||
|
||||
--include-from=filename
|
||||
Treat each non-empty line of the file as the data for an
|
||||
Treat each non-empty line of the file as the data for an
|
||||
--include option. What constitutes a newline for this purpose
|
||||
is the operating system's default. The --newline option has
|
||||
is the operating system's default. The --newline option has
|
||||
no effect on this option. This option may be given any number
|
||||
of times; all the files are read.
|
||||
|
||||
--include-dir=pattern
|
||||
If any --include-dir patterns are specified, the only direc-
|
||||
tories that are processed are those that match one of the
|
||||
patterns (and do not match an --exclude-dir pattern). This
|
||||
applies to all directories, whether listed on the command
|
||||
line, obtained from --file-list, or by scanning a parent di-
|
||||
rectory. The pattern is a PCRE2 regular expression, and is
|
||||
matched against the final component of the directory name,
|
||||
not the entire path. The -F, -w, and -x options do not apply
|
||||
If any --include-dir patterns are specified, the only direc-
|
||||
tories that are processed are those whose names match one of
|
||||
the patterns and do not match an --exclude-dir pattern. This
|
||||
applies to all directories, whether listed on the command
|
||||
line, obtained from --file-list, or by scanning a parent di-
|
||||
rectory. The pattern is a PCRE2 regular expression, and is
|
||||
matched against the final component of the directory name,
|
||||
not the entire path. The -F, -w, and -x options do not apply
|
||||
to this pattern. The option may be given any number of times.
|
||||
If a directory matches both --include-dir and --exclude-dir,
|
||||
If a directory matches both --include-dir and --exclude-dir,
|
||||
it is excluded. There is no short form for this option.
|
||||
|
||||
-L, --files-without-match
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files that do not contain any lines that would
|
||||
have been output. Each file name is output once, on a sepa-
|
||||
rate line. This option overrides any previous -H, -h, or -l
|
||||
options.
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files that do not contain any lines that would
|
||||
have been output. Each file name is output once, on a sepa-
|
||||
rate line by default, but if the -Z option is set, they are
|
||||
separated by zero bytes instead of newlines. This option
|
||||
overrides any previous -H, -h, or -l options.
|
||||
|
||||
-l, --files-with-matches
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files containing lines that would have been out-
|
||||
put. Each file name is output once, on a separate line.
|
||||
Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the -c (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and
|
||||
those files that have at least one match are listed along
|
||||
with their counts. Using this option with -c is a way of sup-
|
||||
pressing the listing of files with no matches. This option
|
||||
overrides any previous -H, -h, or -L options.
|
||||
put. Each file name is output once, on a separate line, but
|
||||
if the -Z option is set, they are separated by zero bytes in-
|
||||
stead of newlines. Searching normally stops as soon as a
|
||||
matching line is found in a file. However, if the -c (count)
|
||||
option is also used, matching continues in order to obtain
|
||||
the correct count, and those files that have at least one
|
||||
match are listed along with their counts. Using this option
|
||||
with -c is a way of suppressing the listing of files with no
|
||||
matches that occurs with -c on its own. This option overrides
|
||||
any previous -H, -h, or -L options.
|
||||
|
||||
--label=name
|
||||
This option supplies a name to be used for the standard input
|
||||
|
@ -473,15 +490,15 @@ OPTIONS
|
|||
When this option is given, non-compressed input is read and
|
||||
processed line by line, and the output is flushed after each
|
||||
write. By default, input is read in large chunks, unless
|
||||
pcre2grep can determine that it is reading from a terminal
|
||||
(which is currently possible only in Unix-like environments
|
||||
or Windows). Output to terminal is normally automatically
|
||||
flushed by the operating system. This option can be useful
|
||||
when the input or output is attached to a pipe and you do not
|
||||
want pcre2grep to buffer up large amounts of data. However,
|
||||
its use will affect performance, and the -M (multiline) op-
|
||||
tion ceases to work. When input is from a compressed .gz or
|
||||
.bz2 file, --line-buffered is ignored.
|
||||
pcre2grep can determine that it is reading from a terminal,
|
||||
which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed
|
||||
by the operating system. This option can be useful when the
|
||||
input or output is attached to a pipe and you do not want
|
||||
pcre2grep to buffer up large amounts of data. However, its
|
||||
use will affect performance, and the -M (multiline) option
|
||||
ceases to work. When input is from a compressed .gz or .bz2
|
||||
file, --line-buffered is ignored.
|
||||
|
||||
--line-offsets
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
|
@ -501,27 +518,71 @@ OPTIONS
|
|||
brary's default (usually the "C" locale) is used. There is no
|
||||
short form for this option.
|
||||
|
||||
-M, --multiline
|
||||
Allow patterns to match more than one line. When this option
|
||||
is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
-M may usefully contain literal newline characters and inter-
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
in which the match ended.
|
||||
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
next line, you could use this command:
|
||||
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
|
||||
The \s escape sequence matches any white space character, in-
|
||||
cluding newlines, and is followed by + so as to match trail-
|
||||
ing white space on the first line as well as possibly han-
|
||||
dling a two-character newline sequence.
|
||||
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
this should not be a problem, but the -M option does not work
|
||||
when input is read line by line (see --line-buffered).
|
||||
|
||||
-m number, --max-count=number
|
||||
Stop processing after finding number matching lines, or non-
|
||||
matching lines if -v is also set. Any trailing context lines
|
||||
are output after the final match. In multiline mode, each
|
||||
multiline match counts as just one line for this purpose. If
|
||||
this limit is reached when reading the standard input from a
|
||||
regular file, the file is left positioned just after the last
|
||||
matching line. If -c is also set, the count that is output
|
||||
is never greater than number. This option has no effect if
|
||||
used with -L, -l, or -q, or when just checking for a match in
|
||||
a binary file.
|
||||
|
||||
--match-limit=number
|
||||
Processing some regular expression patterns may take a very
|
||||
Processing some regular expression patterns may take a very
|
||||
long time to search for all possible matching strings. Others
|
||||
may require a very large amount of memory. There are three
|
||||
may require a very large amount of memory. There are three
|
||||
options that set resource limits for matching.
|
||||
|
||||
The --match-limit option provides a means of limiting comput-
|
||||
ing resource usage when processing patterns that are not go-
|
||||
ing resource usage when processing patterns that are not go-
|
||||
ing to match, but which have a very large number of possibil-
|
||||
ities in their search trees. The classic example is a pattern
|
||||
that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main pro-
|
||||
cessing loop. If the value set by --match-limit is reached,
|
||||
that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main pro-
|
||||
cessing loop. If the value set by --match-limit is reached,
|
||||
an error occurs.
|
||||
|
||||
The --heap-limit option specifies, as a number of kibibytes
|
||||
(units of 1024 bytes), the amount of heap memory that may be
|
||||
used for matching. Heap memory is needed only if matching the
|
||||
pattern requires a significant number of nested backtracking
|
||||
points to be remembered. This parameter can be set to zero to
|
||||
forbid the use of heap memory altogether.
|
||||
The --heap-limit option specifies, as a number of kibibytes
|
||||
(units of 1024 bytes), the maximum amount of heap memory that
|
||||
may be used for matching.
|
||||
|
||||
The --depth-limit option limits the depth of nested back-
|
||||
tracking points, which indirectly limits the amount of memory
|
||||
|
@ -542,66 +603,37 @@ OPTIONS
|
|||
size is silently forced to be no smaller than the starting
|
||||
buffer size.
|
||||
|
||||
-M, --multiline
|
||||
Allow patterns to match more than one line. When this option
|
||||
is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
-M may usefully contain literal newline characters and inter-
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
in which the match ended.
|
||||
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
next line, you could use this command:
|
||||
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
|
||||
The \s escape sequence matches any white space character, in-
|
||||
cluding newlines, and is followed by + so as to match trail-
|
||||
ing white space on the first line as well as possibly han-
|
||||
dling a two-character newline sequence.
|
||||
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
this should not be a problem, but the -M option does not work
|
||||
when input is read line by line (see --line-buffered).
|
||||
|
||||
-N newline-type, --newline=newline-type
|
||||
The PCRE2 library supports five different conventions for in-
|
||||
dicating the ends of lines. They are the single-character se-
|
||||
quences CR (carriage return) and LF (linefeed), the two-char-
|
||||
acter sequence CRLF, an "anycrlf" convention, which recog-
|
||||
nizes any of the preceding three types, and an "any" conven-
|
||||
tion, in which any Unicode line ending sequence is assumed to
|
||||
end a line. The Unicode sequences are the three just men-
|
||||
tioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator,
|
||||
U+2028), and PS (paragraph separator, U+2029).
|
||||
Six different conventions for indicating the ends of lines in
|
||||
scanned files are supported. For example:
|
||||
|
||||
pcre2grep -N CRLF 'some pattern' <file>
|
||||
|
||||
The newline type may be specified in upper, lower, or mixed
|
||||
case. If the newline type is NUL, lines are separated by bi-
|
||||
nary zero characters. The other types are the single-charac-
|
||||
ter sequences CR (carriage return) and LF (linefeed), the
|
||||
two-character sequence CRLF, an "anycrlf" type, which recog-
|
||||
nizes any of the preceding three types, and an "any" type,
|
||||
for which any Unicode line ending sequence is assumed to end
|
||||
a line. The Unicode sequences are the three just mentioned,
|
||||
plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL
|
||||
(next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
|
||||
When the PCRE2 library is built, a default line-ending se-
|
||||
quence is specified. This is normally the standard sequence
|
||||
for the operating system. Unless otherwise specified by this
|
||||
option, pcre2grep uses the library's default. The possible
|
||||
values for this option are CR, LF, CRLF, ANYCRLF, or ANY.
|
||||
This makes it possible to use pcre2grep to scan files that
|
||||
have come from other environments without having to modify
|
||||
their line endings. If the data that is being scanned does
|
||||
not agree with the convention set by this option, pcre2grep
|
||||
may behave in strange ways. Note that this option does not
|
||||
apply to files specified by the -f, --exclude-from, or --in-
|
||||
clude-from options, which are expected to use the operating
|
||||
system's standard newline sequence.
|
||||
option, pcre2grep uses the library's default.
|
||||
|
||||
This option makes it possible to use pcre2grep to scan files
|
||||
that have come from other environments without having to mod-
|
||||
ify their line endings. If the data that is being scanned
|
||||
does not agree with the convention set by this option,
|
||||
pcre2grep may behave in strange ways. Note that this option
|
||||
does not apply to files specified by the -f, --exclude-from,
|
||||
or --include-from options, which are expected to use the op-
|
||||
erating system's standard newline sequence.
|
||||
|
||||
-n, --line-number
|
||||
Precede each output line by its line number in the file, fol-
|
||||
|
@ -619,95 +651,109 @@ OPTIONS
|
|||
lems. It should never be needed in normal use.
|
||||
|
||||
-O text, --output=text
|
||||
When there is a match, instead of outputting the whole line
|
||||
that matched, output just the given text. This option is mu-
|
||||
tually exclusive with --only-matching, --file-offsets, and
|
||||
--line-offsets. Escape sequences starting with a dollar char-
|
||||
acter may be used to insert the contents of the matched part
|
||||
of the line and/or captured substrings into the text.
|
||||
When there is a match, instead of outputting the line that
|
||||
matched, output just the text specified in this option, fol-
|
||||
lowed by an operating-system standard newline. In this mode,
|
||||
no context is shown. That is, the -A, -B, and -C options are
|
||||
ignored. The --newline option has no effect on this option,
|
||||
which is mutually exclusive with --only-matching, --file-off-
|
||||
sets, and --line-offsets. However, like --only-matching, if
|
||||
there is more than one match in a line, each of them causes a
|
||||
line of output.
|
||||
|
||||
$<digits> or ${<digits>} is replaced by the captured sub-
|
||||
string of the given decimal number; zero substitutes the
|
||||
Escape sequences starting with a dollar character may be used
|
||||
to insert the contents of the matched part of the line and/or
|
||||
captured substrings into the text.
|
||||
|
||||
$<digits> or ${<digits>} is replaced by the captured sub-
|
||||
string of the given decimal number; zero substitutes the
|
||||
whole match. If the number is greater than the number of cap-
|
||||
turing substrings, or if the capture is unset, the replace-
|
||||
turing substrings, or if the capture is unset, the replace-
|
||||
ment is empty.
|
||||
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||
$v by vertical tab.
|
||||
|
||||
$o<digits> is replaced by the character represented by the
|
||||
given octal number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose
|
||||
code point is the given octal number. In the first form, up
|
||||
to three octal digits are processed. When more digits are
|
||||
needed in Unicode mode to specify a wide character, the sec-
|
||||
ond form must be used.
|
||||
|
||||
$x<digits> is replaced by the character represented by the
|
||||
given hexadecimal number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character rep-
|
||||
resented by the given hexadecimal number. In the first form,
|
||||
up to two hexadecimal digits are processed. When more digits
|
||||
are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
|
||||
Any other character is substituted by itself. In particular,
|
||||
Any other character is substituted by itself. In particular,
|
||||
$$ is replaced by a single dollar.
|
||||
|
||||
-o, --only-matching
|
||||
Show only the part of the line that matched a pattern instead
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v (in-
|
||||
vert the sense of the match to find non-matching lines), no
|
||||
output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v (in-
|
||||
vert the sense of the match to find non-matching lines), no
|
||||
output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
line. This option is mutually exclusive with --output,
|
||||
--file-offsets and --line-offsets.
|
||||
|
||||
-onumber, --only-matching=number
|
||||
Show only the part of the line that matched the capturing
|
||||
Show only the part of the line that matched the capturing
|
||||
parentheses of the given number. Up to 50 capturing parenthe-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ber specified by -o is greater than the limit.
|
||||
|
||||
-o0 is the same as -o without a number. Because these options
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for ex-
|
||||
ample, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for ex-
|
||||
ample, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
file name or line number are being output.
|
||||
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
(but see the next but one option).
|
||||
|
||||
--om-capture=number
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
by -o. The default is 50.
|
||||
|
||||
--om-separator=text
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
coloured.
|
||||
|
||||
-q, --quiet
|
||||
Work quietly, that is, display nothing except error messages.
|
||||
The exit status indicates whether or not any matches were
|
||||
The exit status indicates whether or not any matches were
|
||||
found.
|
||||
|
||||
-r, --recursive
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to "re-
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to "re-
|
||||
curse".
|
||||
|
||||
--recursion-limit=number
|
||||
See --match-limit above.
|
||||
This is an obsolete synonym for --depth-limit. See --match-
|
||||
limit above for details.
|
||||
|
||||
-s, --no-messages
|
||||
Suppress error messages about non-existent or unreadable
|
||||
|
@ -729,26 +775,30 @@ OPTIONS
|
|||
|
||||
-u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
|
||||
has been compiled with UTF-8 support. All patterns (including
|
||||
those for any --exclude and --include options) and all sub-
|
||||
ject lines that are scanned must be valid strings of UTF-8
|
||||
characters.
|
||||
those for any --exclude and --include options) and all lines
|
||||
that are scanned must be valid strings of UTF-8 characters.
|
||||
If an invalid UTF-8 string is encountered, an error occurs.
|
||||
|
||||
-U, --utf-allow-invalid
|
||||
As --utf, but in addition subject lines may contain invalid
|
||||
UTF-8 code unit sequences. These can never form part of any
|
||||
pattern match. This facility allows valid UTF-8 strings to be
|
||||
sought in executable or other binary files. For more details
|
||||
about matching in non-valid UTF-8 strings, see the pcre2uni-
|
||||
code(3) documentation.
|
||||
pattern match. Patterns themselves, however, must still be
|
||||
valid UTF-8 strings. This facility allows valid UTF-8 strings
|
||||
to be sought within arbitrary byte sequences in executable or
|
||||
other binary files. For more details about matching in non-
|
||||
valid UTF-8 strings, see the pcre2unicode(3) documentation.
|
||||
|
||||
-V, --version
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
command line is ignored.
|
||||
|
||||
-v, --invert-match
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found.
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found. When
|
||||
this option is set, options such as --only-matching and
|
||||
--output, which specify parts of a match that are to be out-
|
||||
put, are ignored.
|
||||
|
||||
-w, --word-regex, --word-regexp
|
||||
Force the patterns only to match "words". That is, there must
|
||||
|
@ -769,6 +819,13 @@ OPTIONS
|
|||
does not apply to patterns specified by any of the --include
|
||||
or --exclude options.
|
||||
|
||||
-Z, --null
|
||||
Terminate file names in the regular output with a zero byte
|
||||
(the NUL character) instead of what would normally appear.
|
||||
This is useful when file names contain unusual characters
|
||||
such as colons, hyphens, or even newlines. The option does
|
||||
not apply to file names in error messages.
|
||||
|
||||
|
||||
ENVIRONMENT VARIABLES
|
||||
|
||||
|
@ -780,17 +837,27 @@ ENVIRONMENT VARIABLES
|
|||
|
||||
NEWLINES
|
||||
|
||||
The -N (--newline) option allows pcre2grep to scan files with different
|
||||
newline conventions from the default. Any parts of the input files that
|
||||
are written to the standard output are copied identically, with what-
|
||||
ever newline sequences they have in the input. However, the setting of
|
||||
this option affects only the way scanned files are processed. It does
|
||||
not affect the interpretation of files specified by the -f, --file-
|
||||
list, --exclude-from, or --include-from options, nor does it affect the
|
||||
way in which pcre2grep writes informational messages to the standard
|
||||
error and output streams. For these it uses the string "\n" to indicate
|
||||
newlines, relying on the C I/O library to convert this to an appropri-
|
||||
ate sequence.
|
||||
The -N (--newline) option allows pcre2grep to scan files with newline
|
||||
conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation
|
||||
of files specified by the -f, --file-list, --exclude-from, or --in-
|
||||
clude-from options.
|
||||
|
||||
Any parts of the scanned input files that are written to the standard
|
||||
output are copied with whatever newline sequences they have in the in-
|
||||
put. However, if the final line of a file is output, and it does not
|
||||
end with a newline sequence, a newline sequence is added. If the new-
|
||||
line setting is CR, LF, CRLF or NUL, that line ending is output; for
|
||||
the other settings (ANYCRLF or ANY) a single NL is used.
|
||||
|
||||
The newline setting does not affect the way in which pcre2grep writes
|
||||
newlines in informational messages to the standard output and error
|
||||
streams. Under Windows, the standard output is set to be binary, so
|
||||
that "\r\n" at the ends of output lines that are copied from the input
|
||||
is not converted to "\r\r\n" by the C I/O library. This means that any
|
||||
messages written to the standard output must end with "\r\n". For all
|
||||
other operating systems, and for all messages to the standard error
|
||||
stream, "\n" is used.
|
||||
|
||||
|
||||
OPTIONS COMPATIBILITY
|
||||
|
@ -860,30 +927,49 @@ USING PCRE2'S CALLOUT FACILITY
|
|||
mentation for details). Numbered callouts are ignored by pcre2grep;
|
||||
only callouts with string arguments are useful.
|
||||
|
||||
Echoing a specific string
|
||||
|
||||
Starting the callout string with a pipe character invokes an echoing
|
||||
facility that avoids calling an external program or script. This facil-
|
||||
ity is always available, provided that callouts were not completely
|
||||
disabled when pcre2grep was built. The rest of the callout string is
|
||||
processed as a zero-terminated string, which means it should not con-
|
||||
tain any internal binary zeros. It is written to the output, having
|
||||
first been passed through the same escape processing as text from the
|
||||
--output (-O) option (see above). However, $0 cannot be used to insert
|
||||
a matched substring because the match is still in progress. Instead,
|
||||
the single character '0' is inserted. Any syntax errors in the string
|
||||
(for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. No terminator is added to the output string, so
|
||||
if you want a newline, you must include it explicitly using the escape
|
||||
$n. For example:
|
||||
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
|
||||
Matching continues normally after the string is output. If you want to
|
||||
see only the callout output but not any output from an actual match,
|
||||
you should end the pattern with (*FAIL).
|
||||
|
||||
Calling external programs or scripts
|
||||
|
||||
This facility can be independently disabled when pcre2grep is built. It
|
||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||
where lib$spawn() is used, and for any other Unix-like environment
|
||||
where fork() and execv() are available.
|
||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||
where lib$spawn() is used, and for any Unix-like environment where
|
||||
fork() and execv() are available.
|
||||
|
||||
If the callout string does not start with a pipe (vertical bar) charac-
|
||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||
ters. The first substring must be an executable name, with the follow-
|
||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||
ters. The first substring must be an executable name, with the follow-
|
||||
ing substrings specifying arguments:
|
||||
|
||||
executable_name|arg1|arg2|...
|
||||
|
||||
Any substring (including the executable name) may contain escape se-
|
||||
quences started by a dollar character: $<digits> or ${<digits>} is re-
|
||||
placed by the captured substring of the given decimal number, which
|
||||
must be greater than zero. If the number is greater than the number of
|
||||
capturing substrings, or if the capture is unset, the replacement is
|
||||
empty.
|
||||
|
||||
Any other character is substituted by itself. In particular, $$ is re-
|
||||
placed by a single dollar and $| is replaced by a pipe character. Here
|
||||
is an example:
|
||||
Any substring (including the executable name) may contain escape se-
|
||||
quences started by a dollar character. These are the same as for the
|
||||
--output (-O) option documented above, except that $0 cannot insert the
|
||||
matched string because the match is still in progress. Instead, the
|
||||
character '0' is inserted. If you need a literal dollar or pipe charac-
|
||||
ter in any substring, use $$ or $| respectively. Here is an example:
|
||||
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -896,28 +982,15 @@ USING PCRE2'S CALLOUT FACILITY
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
|
||||
The parameters for the system call that is used to run the program or
|
||||
The parameters for the system call that is used to run the program or
|
||||
script are zero-terminated strings. This means that binary zero charac-
|
||||
ters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in
|
||||
the string (for example, a dollar not followed by another character)
|
||||
cause the callout to be ignored. If running the program fails for any
|
||||
reason (including the non-existence of the executable), a local match-
|
||||
ters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in
|
||||
the string (for example, a dollar not followed by another character)
|
||||
cause the callout to be ignored. If running the program fails for any
|
||||
reason (including the non-existence of the executable), a local match-
|
||||
ing failure occurs and the matcher backtracks in the normal way.
|
||||
|
||||
Echoing a specific string
|
||||
|
||||
This facility is always available, provided that callouts were not com-
|
||||
pletely disabled when pcre2grep was built. If the callout string starts
|
||||
with a pipe (vertical bar) character, the rest of the string is written
|
||||
to the output, having been passed through the same escape processing as
|
||||
text from the --output option. This provides a simple echoing facility
|
||||
that avoids calling an external program or script. No terminator is
|
||||
added to the string, so if you want a newline, you must include it ex-
|
||||
plicitly. Matching continues normally after the string is output. If
|
||||
you want to see only the callout output but not any output from an ac-
|
||||
tual match, you should end the relevant pattern with (*FAIL).
|
||||
|
||||
|
||||
MATCHING ERRORS
|
||||
|
||||
|
@ -951,17 +1024,17 @@ DIAGNOSTICS
|
|||
|
||||
SEE ALSO
|
||||
|
||||
pcre2pattern(3), pcre2syntax(3), pcre2callout(3).
|
||||
pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3).
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 15 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -29,6 +29,7 @@ platforms:
|
|||
.sp
|
||||
ARM 32-bit (v5, v7, and Thumb2)
|
||||
ARM 64-bit
|
||||
IBM s390x 64 bit
|
||||
Intel x86 32-bit and 64-bit
|
||||
MIPS 32-bit and 64-bit
|
||||
Power PC 32-bit and 64-bit
|
||||
|
@ -64,7 +65,7 @@ or a negative error code.
|
|||
There is a limit to the size of pattern that JIT supports, imposed by the size
|
||||
of machine stack that it uses. The exact rules are not documented because they
|
||||
may change at any time, in particular, when new optimizations are introduced.
|
||||
If a pattern is too big, a call to \fBpcre2_jit_compile()\fB returns
|
||||
If a pattern is too big, a call to \fBpcre2_jit_compile()\fP returns
|
||||
PCRE2_ERROR_NOMEMORY.
|
||||
.P
|
||||
PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete
|
||||
|
@ -250,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
|
|||
starts another match, that match must use a different JIT stack to the one used
|
||||
for currently suspended match(es).
|
||||
.P
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
.P
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
to a match context that is used by any number of patterns, as long as they are
|
||||
|
@ -266,7 +267,7 @@ inefficient solution, and not recommended.
|
|||
This is a suggestion for how a multithreaded program that needs to set up
|
||||
non-default JIT stacks might operate:
|
||||
.sp
|
||||
During thread initalization
|
||||
During thread initialization
|
||||
thread_local_var = pcre2_jit_stack_create(...)
|
||||
.sp
|
||||
During thread exit
|
||||
|
@ -315,12 +316,12 @@ stack through the JIT callback function.
|
|||
You can free a JIT stack at any time, as long as it will not be used by
|
||||
\fBpcre2_match()\fP again. When you assign the stack to a match context, only a
|
||||
pointer is set. There is no reference counting or any other magic. You can free
|
||||
compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo
|
||||
not\fP call \fBpcre2_match()\fP with a match context pointing to an already
|
||||
freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently
|
||||
used by \fBpcre2_match()\fP in another thread). You can also replace the stack
|
||||
in a context at any time when it is not in use. You should free the previous
|
||||
stack before assigning a replacement.
|
||||
compiled patterns, contexts, and stacks in any order, anytime.
|
||||
Just \fIdo not\fP call \fBpcre2_match()\fP with a match context pointing to an
|
||||
already freed stack, as that will cause SEGFAULT. (Also, do not free a stack
|
||||
currently used by \fBpcre2_match()\fP in another thread). You can also replace
|
||||
the stack in a context at any time when it is not in use. You should free the
|
||||
previous stack before assigning a replacement.
|
||||
.P
|
||||
(5) Should I allocate/free a stack every time before/after calling
|
||||
\fBpcre2_match()\fP?
|
||||
|
@ -354,8 +355,8 @@ out this complicated API.
|
|||
.B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
|
||||
.fi
|
||||
.P
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -415,10 +416,10 @@ that was not compiled.
|
|||
.P
|
||||
When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
.P
|
||||
Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
|
||||
speedups of more than 10%.
|
||||
|
@ -444,6 +445,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 November 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SIZE AND OTHER LIMITATIONS"
|
||||
|
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
.P
|
||||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
.P
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -67,6 +71,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2MATCHING 3 "23 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2MATCHING 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 MATCHING ALGORITHMS"
|
||||
|
@ -61,8 +61,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
|
|||
If a leaf node is reached, a matching string has been found, and at that point
|
||||
the algorithm stops. Thus, if there is more than one possible match, this
|
||||
algorithm returns the first one that it finds. Whether this is the shortest,
|
||||
the longest, or some intermediate length depends on the way the greedy and
|
||||
ungreedy repetition quantifiers are specified in the pattern.
|
||||
the longest, or some intermediate length depends on the way the alternations
|
||||
and the greedy or ungreedy repetition quantifiers are specified in the
|
||||
pattern.
|
||||
.P
|
||||
Because it ends up with a single path through the tree, it is relatively
|
||||
straightforward for this algorithm to keep track of the substrings that are
|
||||
|
@ -91,10 +92,15 @@ no more unterminated paths. At this point, terminated paths represent the
|
|||
different matching possibilities (if there are none, the match has failed).
|
||||
Thus, if there is more than one possible match, this algorithm finds all of
|
||||
them, and in particular, it finds the longest. The matches are returned in
|
||||
decreasing order of length. There is an option to stop the algorithm after the
|
||||
first match (which is necessarily the shortest) is found.
|
||||
the output vector in decreasing order of length. There is an option to stop the
|
||||
algorithm after the first match (which is necessarily the shortest) is found.
|
||||
.P
|
||||
Note that all the matches that are found start at the same point in the
|
||||
Note that the size of vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
|
||||
data block is therefore not advisable when doing DFA matching.
|
||||
.P
|
||||
Note also that all the matches that are found start at the same point in the
|
||||
subject. If the pattern
|
||||
.sp
|
||||
cat(er(pillar)?)?
|
||||
|
@ -165,19 +171,13 @@ supported by \fBpcre2_dfa_match()\fP.
|
|||
.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
The main advantage of the alternative algorithm is that all possible matches
|
||||
(at a single point in the subject) are automatically found, and in particular,
|
||||
the longest match is found. To find more than one match at the same point using
|
||||
the standard algorithm, you have to do kludgy things with callouts.
|
||||
.P
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
match using the standard algorithm, you have to do kludgy things with
|
||||
callouts.
|
||||
.P
|
||||
2. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack (except for lookbehinds), it is possible to pass very
|
||||
long subject strings to the matching function in several pieces, checking for
|
||||
partial matching each time. Although it is also possible to do multi-segment
|
||||
matching using the standard algorithm, by retaining partially matched
|
||||
substrings, it is more complicated. The
|
||||
Partial matching is possible with this algorithm, though it has some
|
||||
limitations. The
|
||||
.\" HREF
|
||||
\fBpcre2partial\fP
|
||||
.\"
|
||||
|
@ -199,6 +199,8 @@ invalid UTF string are not supported.
|
|||
.P
|
||||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
.P
|
||||
4. JIT optimization is not supported.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -206,7 +208,7 @@ performance advantage that it does for the standard algorithm.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -215,6 +217,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 28 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -261,7 +261,7 @@ these characters with '<' if the \fBallusedtext\fP modifier is set:
|
|||
Partial match: 123ab
|
||||
<<<
|
||||
.sp
|
||||
However, the \fPallusedtext\fP modifier is not available for JIT matching,
|
||||
However, the \fBallusedtext\fP modifier is not available for JIT matching,
|
||||
because JIT matching does not record the first (or last) consulted characters.
|
||||
For this reason, this information is not available via the API. It is therefore
|
||||
not possible in general to obtain the exact number of characters that must be
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "29 July 2019" "PCRE2 10.34"
|
||||
.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -75,7 +75,8 @@ Another special sequence that may appear at the start of a pattern is (*UCP).
|
|||
This has the same effect as setting the PCRE2_UCP option: it causes sequences
|
||||
such as \ed and \ew to use Unicode properties to determine character types,
|
||||
instead of recognizing only characters with codes less than 256 via a lookup
|
||||
table.
|
||||
table. It also causes upper/lower casing operations to use Unicode properties
|
||||
for characters with code points greater than 127, even when UTF is not set.
|
||||
.P
|
||||
Some applications that allow their users to supply patterns may wish to
|
||||
restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to
|
||||
|
@ -262,8 +263,11 @@ corresponding characters in the subject. As a trivial example, the pattern
|
|||
The quick brown fox
|
||||
.sp
|
||||
matches a portion of a subject string that is identical to itself. When
|
||||
caseless matching is specified (the PCRE2_CASELESS option), letters are matched
|
||||
independently of case.
|
||||
caseless matching is specified (the PCRE2_CASELESS option or (?i) within the
|
||||
pattern), letters are matched independently of case. Note that there are two
|
||||
ASCII characters, K and S, that, in addition to their lower case ASCII
|
||||
equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
|
||||
(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
.P
|
||||
The power of regular expressions comes from the ability to include wild cards,
|
||||
character classes, alternatives, and repetitions in the pattern. These are
|
||||
|
@ -297,6 +301,22 @@ a character class the only metacharacters are:
|
|||
[ POSIX character class (if followed by POSIX syntax)
|
||||
] terminates the character class
|
||||
.sp
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern, other than in a character class, and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or a # character as
|
||||
part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same
|
||||
applies, but in addition unescaped space and horizontal tab characters are
|
||||
ignored inside a character class. Note: only these two characters are ignored,
|
||||
not the full set of pattern white space characters that are ignored outside a
|
||||
character class. Option settings can be changed within a pattern; see the
|
||||
section entitled
|
||||
.\" HTML <a href="#internaloptions">
|
||||
.\" </a>
|
||||
"Internal Option Setting"
|
||||
.\"
|
||||
below.
|
||||
.P
|
||||
The following sections describe the use of each of the metacharacters.
|
||||
.
|
||||
.
|
||||
|
@ -314,15 +334,9 @@ would otherwise be interpreted as a metacharacter, so it is always safe to
|
|||
precede a non-alphanumeric with backslash to specify that it stands for itself.
|
||||
In particular, if you want to match a backslash, you write \e\e.
|
||||
.P
|
||||
In a UTF mode, only ASCII digits and letters have any special meaning after a
|
||||
backslash. All other characters (in particular, those whose code points are
|
||||
greater than 127) are treated as literals.
|
||||
.P
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern (other than in a character class), and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or # character as part
|
||||
of the pattern.
|
||||
Only ASCII digits and letters have any special meaning after a backslash. All
|
||||
other characters (in particular, those whose code points are greater than 127)
|
||||
are treated as literals.
|
||||
.P
|
||||
If you want to treat all characters in a sequence as literals, you can do so by
|
||||
putting them between \eQ and \eE. This is different from Perl in that $ and @
|
||||
|
@ -495,7 +509,6 @@ for themselves. For example, outside a character class:
|
|||
.\" JOIN
|
||||
\e377 might be a backreference, otherwise
|
||||
the value 255 (decimal)
|
||||
.\" JOIN
|
||||
\e81 is always a backreference
|
||||
.sp
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
|
@ -727,7 +740,7 @@ Unicode support is not needed for these characters to be recognized.
|
|||
.P
|
||||
It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
|
||||
complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
|
||||
at compile time. (BSR is an abbrevation for "backslash R".) This can be made
|
||||
at compile time. (BSR is an abbreviation for "backslash R".) This can be made
|
||||
the default when PCRE2 is built; if this is the case, the other behaviour can
|
||||
be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
|
||||
these settings by starting a pattern string with one of the following
|
||||
|
@ -759,191 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.P
|
||||
The extra escape sequences that provide property support are:
|
||||
.sp
|
||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The property names represented by \fIxx\fP above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
The property names represented by \fIxx\fP above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
.\" HTML <a href="#extraprops">
|
||||
.\" </a>
|
||||
next section).
|
||||
below).
|
||||
.\"
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \eP{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \eP{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "Script properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and an equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
.P
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
.sp
|
||||
\ep{Greek}
|
||||
\eP{Han}
|
||||
.sp
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
.P
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "The general category property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
specified by including a circumflex between the opening brace and the property
|
||||
|
@ -1003,9 +889,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
.sp
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
.P
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
the range U+D800 to U+DFFF. These characters are no different to any other
|
||||
|
@ -1029,12 +915,53 @@ Unicode table.
|
|||
Specifying caseless matching does not affect these escape sequences. For
|
||||
example, \ep{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.
|
||||
.
|
||||
.SS "Binary (yes/no) properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.SS "The Bidi_Class property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
.sp
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
.
|
||||
.
|
||||
.SS Extended grapheme clusters
|
||||
|
@ -1064,7 +991,7 @@ additional characters according to the following rules for ending a cluster:
|
|||
3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
|
||||
are of five types: L, V, T, LV, and LVT. An L character may be followed by an
|
||||
L, V, LV, or LVT character; an LV or V character may be followed by a V or T
|
||||
character; an LVT or T character may be follwed only by a T character.
|
||||
character; an LVT or T character may be followed only by a T character.
|
||||
.P
|
||||
4. Do not end before extending characters or spacing marks or the "zero-width
|
||||
joiner" character. Characters with the "mark" property always have the
|
||||
|
@ -1150,8 +1077,11 @@ For example, when the pattern
|
|||
.sp
|
||||
matches "foobar", the first substring is still set to "foo".
|
||||
.P
|
||||
Perl documents that the use of \eK within assertions is "not well defined". In
|
||||
PCRE2, \eK is acted upon when it occurs inside positive assertions, but is
|
||||
From version 5.32.0 Perl forbids the use of \eK in lookaround assertions. From
|
||||
release 10.38 PCRE2 also forbids this by default. However, the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
|
||||
\fBpcre2_compile()\fP to re-enable the previous behaviour. When this option is
|
||||
set, \eK is acted upon when it occurs inside positive assertions, but is
|
||||
ignored in negative assertions. Note that when a pattern such as (?=ab\eK)
|
||||
matches, the reported start of the match can be greater than the end of the
|
||||
match. Using \eK in a lookbehind assertion at the start of a pattern can also
|
||||
|
@ -1310,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
.sp
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
.\" HTML <a href="#newlines">
|
||||
.\" </a>
|
||||
"Newline conventions"
|
||||
.\"
|
||||
above).
|
||||
.P
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences
|
||||
of CR or LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
.P
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
PCRE2_DOTALL option is set, a dot matches any one character, without exception.
|
||||
|
@ -1431,7 +1366,10 @@ Characters in a class may be specified by their code points using \eo, \ex, or
|
|||
\eN{U+hh..} in the usual way. When caseless matching is set, any letters in a
|
||||
class represent both their upper case and lower case versions, so for example,
|
||||
a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
|
||||
match "A", whereas a caseful version would.
|
||||
match "A", whereas a caseful version would. Note that there are two ASCII
|
||||
characters, K and S, that, in addition to their lower case ASCII equivalents,
|
||||
are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S)
|
||||
respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
.P
|
||||
Characters that might indicate line breaks are never treated in any special way
|
||||
when matching character classes, whatever line-ending sequence is in use, and
|
||||
|
@ -1643,6 +1581,7 @@ that succeeds is used. If the alternatives are within a group
|
|||
alternative in the group.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="internaloptions"></a>
|
||||
.SH "INTERNAL OPTION SETTING"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -1901,12 +1840,21 @@ are permitted for groups with the same number, for example:
|
|||
(?|(?<AA>aa)|(?<AA>bb))
|
||||
.sp
|
||||
The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES
|
||||
option at compile time, or by the use of (?J) within the pattern. Duplicate
|
||||
names can be useful for patterns where only one instance of the named capture
|
||||
group can match. Suppose you want to match the name of a weekday, either as a
|
||||
3-letter abbreviation or as the full name, and in both cases you want to
|
||||
extract the abbreviation. This pattern (ignoring the line breaks) does the job:
|
||||
option at compile time, or by the use of (?J) within the pattern, as described
|
||||
in the section entitled
|
||||
.\" HTML <a href="#internaloptions">
|
||||
.\" </a>
|
||||
"Internal Option Setting"
|
||||
.\"
|
||||
above.
|
||||
.P
|
||||
Duplicate names can be useful for patterns where only one instance of the named
|
||||
capture group can match. Suppose you want to match the name of a weekday,
|
||||
either as a 3-letter abbreviation or as the full name, and in both cases you
|
||||
want to extract the abbreviation. This pattern (ignoring the line breaks) does
|
||||
the job:
|
||||
.sp
|
||||
(?J)
|
||||
(?<DN>Mon|Fri|Sun)(?:day)?|
|
||||
(?<DN>Tue)(?:sday)?|
|
||||
(?<DN>Wed)(?:nesday)?|
|
||||
|
@ -1926,7 +1874,7 @@ they appear in the overall pattern. The first one that is set is used for the
|
|||
reference. For example, this pattern matches both "foofoo" and "barbar" but not
|
||||
"foobar" or "barfoo":
|
||||
.sp
|
||||
(?:(?<n>foo)|(?<n>bar))\ek<n>
|
||||
(?J)(?:(?<n>foo)|(?<n>bar))\ek<n>
|
||||
.sp
|
||||
.P
|
||||
If you make a subroutine call to a non-unique named group, the one that
|
||||
|
@ -1965,7 +1913,7 @@ items:
|
|||
an escape such as \ed or \epL that matches a single character
|
||||
a character class
|
||||
a backreference
|
||||
a parenthesized group (including most assertions)
|
||||
a parenthesized group (including lookaround assertions)
|
||||
a subroutine call (recursive or otherwise)
|
||||
.sp
|
||||
The general repetition quantifier specifies a minimum and maximum number of
|
||||
|
@ -2147,10 +2095,10 @@ be easier to remember:
|
|||
.sp
|
||||
(*atomic:\ed+)foo
|
||||
.sp
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
.P
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
string of characters that an identical standalone pattern would match, if
|
||||
|
@ -2346,14 +2294,14 @@ the first iteration does not need to match the backreference. This can be done
|
|||
using alternation, as in the example above, or by a quantifier with a minimum
|
||||
of zero.
|
||||
.P
|
||||
Backreferences of this type cause the group that they reference to be treated
|
||||
as an
|
||||
For versions of PCRE2 less than 10.25, backreferences of this type used to
|
||||
cause the group that they reference to be treated as an
|
||||
.\" HTML <a href="#atomicgroup">
|
||||
.\" </a>
|
||||
atomic group.
|
||||
.\"
|
||||
Once the whole group has been matched, a subsequent matching failure cannot
|
||||
cause backtracking into the middle of the group.
|
||||
This restriction no longer applies, and backtracking into such groups can occur
|
||||
as normal.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="bigassertions"></a>
|
||||
|
@ -2421,26 +2369,13 @@ the "no" branch of the condition. For other failing negative assertions,
|
|||
control passes to the previous backtracking point, thus discarding any captured
|
||||
strings within the assertion.
|
||||
.P
|
||||
For compatibility with Perl, most assertion groups may be repeated; though it
|
||||
makes no sense to assert the same thing several times, the side effect of
|
||||
capturing may occasionally be useful. However, an assertion that forms the
|
||||
condition for a conditional group may not be quantified. In practice, for
|
||||
other assertions, there only three cases:
|
||||
.sp
|
||||
(1) If the quantifier is {0}, the assertion is never obeyed during matching.
|
||||
However, it may contain internal capture groups that are called from elsewhere
|
||||
via the
|
||||
.\" HTML <a href="#groupsassubroutines">
|
||||
.\" </a>
|
||||
subroutine mechanism.
|
||||
.\"
|
||||
.sp
|
||||
(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
|
||||
were {0,1}. At run time, the rest of the pattern match is tried with and
|
||||
without the assertion, the order depending on the greediness of the quantifier.
|
||||
.sp
|
||||
(3) If the minimum repetition is greater than zero, the quantifier is ignored.
|
||||
The assertion is obeyed just once when encountered during matching.
|
||||
Most assertion groups may be repeated; though it makes no sense to assert the
|
||||
same thing several times, the side effect of capturing in positive assertions
|
||||
may occasionally be useful. However, an assertion that forms the condition for
|
||||
a conditional group may not be quantified. PCRE2 used to restrict the
|
||||
repetition of assertions, but from release 10.35 the only restriction is that
|
||||
an unlimited maximum repetition is changed to be one more than the minimum. For
|
||||
example, {3,} is treated as {3,4}.
|
||||
.
|
||||
.
|
||||
.SS "Alphabetic assertion names"
|
||||
|
@ -2637,8 +2572,8 @@ backtracking into the assertion. However, there are some cases where non-atomic
|
|||
positive assertions can be useful. PCRE2 provides these using the following
|
||||
syntax:
|
||||
.sp
|
||||
(*non_atomic_positive_lookahead: or (*napla:
|
||||
(*non_atomic_positive_lookbehind: or (*naplb:
|
||||
(*non_atomic_positive_lookahead: or (*napla: or (?*
|
||||
(*non_atomic_positive_lookbehind: or (*naplb: or (?<*
|
||||
.sp
|
||||
Consider the problem of finding the right-most word in a string that also
|
||||
appears earlier in the string, that is, it must appear at least twice in total.
|
||||
|
@ -2674,9 +2609,14 @@ pattern. If this is not the case, the rest of the pattern match fails exactly
|
|||
as before because nothing has changed, so using a non-atomic assertion just
|
||||
wastes resources.
|
||||
.P
|
||||
There is one exception to backtracking into a non-atomic assertion. If an
|
||||
(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That
|
||||
is, a subsequent match failure cannot backtrack into the assertion.
|
||||
.P
|
||||
Non-atomic assertions are not supported by the alternative matching function
|
||||
\fBpcre2_dfa_match()\fP. They are also not supported by JIT (but may be in
|
||||
future). Note that assertions that appear as conditions for
|
||||
\fBpcre2_dfa_match()\fP. They are supported by JIT, but only if they do not
|
||||
contain any control verbs such as (*ACCEPT). (This may change in future). Note
|
||||
that assertions that appear as conditions for
|
||||
.\" HTML <a href="#conditions">
|
||||
.\" </a>
|
||||
conditional groups
|
||||
|
@ -2904,7 +2844,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
|
||||
\eb (?&byte) (\e.(?&byte)){3} \eb
|
||||
.sp
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -3634,7 +3574,7 @@ successful match if there is a later mismatch. Consider:
|
|||
.sp
|
||||
If the subject is "aaaac...", after the first match attempt fails (starting at
|
||||
the first character in the string), the starting point skips on to start the
|
||||
next attempt at "c". Note that a possessive quantifer does not have the same
|
||||
next attempt at "c". Note that a possessive quantifier does not have the same
|
||||
effect as this example; although it would suppress backtracking during the
|
||||
first match attempt, the second attempt would start at the second character
|
||||
instead of skipping on to "c".
|
||||
|
@ -3865,7 +3805,7 @@ there is a backtrack at the outer level.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -3874,6 +3814,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 29 July 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 PERFORMANCE"
|
||||
|
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
.P
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
.P
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to \fBpcre2_match()\fP. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
.P
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to \fBpcre2_match()\fP with the same match data block does not
|
||||
affect the saved block.
|
||||
.P
|
||||
In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround assertions,
|
||||
|
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -239,6 +255,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2POSIX 3 "30 January 2019" "PCRE2 10.33"
|
||||
.TH PCRE2POSIX 3 "26 April 2021" "PCRE2 10.37"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SYNOPSIS"
|
||||
|
@ -44,11 +44,14 @@ can be accessed by adding \fB-lpcre2-posix\fP to the command for linking an
|
|||
application. Because the POSIX functions call the native ones, it is also
|
||||
necessary to add \fB-lpcre2-8\fP.
|
||||
.P
|
||||
Although they are not defined as protypes in \fBpcre2posix.h\fP, the library
|
||||
does contain functions with the POSIX names \fBregcomp()\fP etc. These simply
|
||||
pass their arguments to the PCRE2 functions. These functions are provided for
|
||||
backwards compatibility with earlier versions of PCRE2, so that existing
|
||||
programs do not have to be recompiled.
|
||||
Although they were not defined as prototypes in \fBpcre2posix.h\fP, releases
|
||||
10.33 to 10.36 of the library contained functions with the POSIX names
|
||||
\fBregcomp()\fP etc. These simply passed their arguments to the PCRE2
|
||||
functions. These functions were provided for backwards compatibility with
|
||||
earlier versions of PCRE2, which had only POSIX names. However, this has proved
|
||||
troublesome in situations where a program links with several libraries, some of
|
||||
which use PCRE2's POSIX interface while others use the real POSIX functions.
|
||||
For this reason, the POSIX names have been removed since release 10.37.
|
||||
.P
|
||||
Calling the header file \fBpcre2posix.h\fP avoids any conflict with other POSIX
|
||||
libraries. It can, of course, be renamed or aliased as \fBregex.h\fP, which is
|
||||
|
@ -321,6 +324,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 January 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 April 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
.nf
|
||||
.B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
|
||||
.B " pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
|
||||
.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
|
||||
.B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
|
||||
|
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
.sp
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
.sp
|
||||
|
@ -141,7 +141,6 @@ management functions for the decoded patterns. If this argument is NULL,
|
|||
\fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
.sp
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "29 July 2019" "PCRE2 10.34"
|
||||
.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range
|
|||
128-255. If the PCRE2_UCP option is set, the behaviour of these escape
|
||||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
.P
|
||||
Property descriptions in \ep and \eP are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
.
|
||||
.
|
||||
.SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP"
|
||||
|
@ -120,6 +124,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
.sp
|
||||
M Mark
|
||||
|
@ -167,161 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set
|
|||
at release 5.18.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT NAMES FOR \ep AND \eP"
|
||||
.SH "BINARY PROPERTIES FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Elymaic,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
Nandinagari,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Nyakeng_Puachue_Hmong,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Wancho,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT MATCHING WITH \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS white space
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER CLASSES"
|
||||
|
@ -397,6 +300,9 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
.sp
|
||||
\eK set reported start of match
|
||||
.sp
|
||||
From release 10.38 \eK is not permitted by default in lookaround assertions,
|
||||
for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\eK is honoured in positive assertions, but ignored in negative ones.
|
||||
.
|
||||
.
|
||||
|
@ -441,7 +347,7 @@ Changes of these options within a group are automatically cancelled at the end
|
|||
of the group.
|
||||
.sp
|
||||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?J) allow duplicate named groups
|
||||
(?m) multiline
|
||||
(?n) no auto capture
|
||||
(?s) single line (dotall)
|
||||
|
@ -531,11 +437,13 @@ Each top-level branch of a lookbehind must be of a fixed length.
|
|||
.sp
|
||||
These assertions are specific to PCRE2 and are not Perl-compatible.
|
||||
.sp
|
||||
(*napla:...)
|
||||
(*non_atomic_positive_lookahead:...)
|
||||
(?*...) )
|
||||
(*napla:...) ) synonyms
|
||||
(*non_atomic_positive_lookahead:...) )
|
||||
.sp
|
||||
(*naplb:...)
|
||||
(*non_atomic_positive_lookbehind:...)
|
||||
(?<*...) )
|
||||
(*naplb:...) ) synonyms
|
||||
(*non_atomic_positive_lookbehind:...) )
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT RUNS"
|
||||
|
@ -661,7 +569,7 @@ delimiter }. To encode the ending delimiter within the string, double it.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -670,6 +578,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 29 July 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
210
doc/pcre2test.1
210
doc/pcre2test.1
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "30 July 2019" "PCRE 10.34"
|
||||
.TH PCRE2TEST 1 "27 July 2022" "PCRE 10.41"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -27,12 +27,7 @@ each match attempt. Modifiers on external or internal command lines, the
|
|||
patterns, and the subject lines specify PCRE2 function options, control how the
|
||||
subject is processed, and what output is produced.
|
||||
.P
|
||||
As the original fairly simple PCRE library evolved, it acquired many different
|
||||
features, and as a result, the original \fBpcretest\fP program ended up with a
|
||||
lot of options in a messy, arcane syntax for testing all the features. The
|
||||
move to the new PCRE2 API provided an opportunity to re-implement the test
|
||||
program as \fBpcre2test\fP, with a cleaner modifier syntax. Nevertheless, there
|
||||
are still many obscure modifiers, some of which are specifically designed for
|
||||
There are many obscure modifiers, some of which are specifically designed for
|
||||
use in conjunction with the test script and data files that are distributed as
|
||||
part of PCRE2. All the modifiers are documented here, some without much
|
||||
justification, but many of them are unlikely to be of use except when testing
|
||||
|
@ -52,7 +47,7 @@ format before being passed to the library functions. Results are converted back
|
|||
to 8-bit code units for output.
|
||||
.P
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, \fBpcre_compile()\fP. The actual
|
||||
are given in generic form, for example, \fBpcre2_compile()\fP. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
.
|
||||
.
|
||||
|
@ -61,10 +56,10 @@ names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
|||
.rs
|
||||
.sp
|
||||
Input to \fBpcre2test\fP is processed line by line, either by calling the C
|
||||
library's \fBfgets()\fP function, or via the \fBlibreadline\fP library. In some
|
||||
Windows environments character 26 (hex 1A) causes an immediate end of file, and
|
||||
no further data is read, so this character should be avoided unless you really
|
||||
want that action.
|
||||
library's \fBfgets()\fP function, or via the \fBlibreadline\fP or \fBlibedit\fP
|
||||
library. In some Windows environments character 26 (hex 1A) causes an immediate
|
||||
end of file, and no further data is read, so this character should be avoided
|
||||
unless you really want that action.
|
||||
.P
|
||||
The input is processed using C's string functions, so must not
|
||||
contain binary zeros, even though in Unix-like environments, \fBfgets()\fP
|
||||
|
@ -216,9 +211,19 @@ available, and the use of JIT for matching is verified.
|
|||
\fB-LM\fP
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-pattern\fB \fImodifier-list\fP
|
||||
\fB-LP\fP
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-LS\fP
|
||||
List scripts: write a list of recognized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
.TP 10
|
||||
\fB-pattern\fP \fImodifier-list\fP
|
||||
Behave as if each pattern line contains the given modifiers.
|
||||
.TP 10
|
||||
\fB-q\fP
|
||||
|
@ -273,7 +278,7 @@ test data, command lines that begin with # may appear. This file format, with
|
|||
some restrictions, can also be processed by the \fBperltest.sh\fP script that
|
||||
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
|
||||
and Perl is the same. For a specification of \fBperltest.sh\fP, see the
|
||||
comments near its beginning.
|
||||
comments near its beginning. See also the #perltest command below.
|
||||
.P
|
||||
When the input is a terminal, \fBpcre2test\fP prompts for each line of input,
|
||||
using "re>" to prompt for regular expression patterns, and "data>" to prompt
|
||||
|
@ -326,6 +331,12 @@ described in the section entitled "Saving and restoring compiled patterns"
|
|||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
.sp
|
||||
#loadtables <filename>
|
||||
.sp
|
||||
This command is used to load a set of binary character tables that can be
|
||||
accessed by the tables=3 qualifier. Such tables can be created by the
|
||||
\fBpcre2_dftables\fP program with the -b option.
|
||||
.sp
|
||||
#newline_default [<newline-list>]
|
||||
.sp
|
||||
|
@ -363,14 +374,19 @@ patterns. Modifiers on a pattern can change these settings.
|
|||
.sp
|
||||
#perltest
|
||||
.sp
|
||||
The appearance of this line causes all subsequent modifier settings to be
|
||||
checked for compatibility with the \fBperltest.sh\fP script, which is used to
|
||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
||||
lines, #pattern commands, and #subject commands that set or unset "mark", no
|
||||
command lines are permitted, because they and many of the modifiers are
|
||||
specific to \fBpcre2test\fP, and should not be used in test files that are also
|
||||
processed by \fBperltest.sh\fP. The \fB#perltest\fP command helps detect tests
|
||||
that are accidentally put in the wrong file.
|
||||
This line is used in test files that can also be processed by \fBperltest.sh\fP
|
||||
to confirm that Perl gives the same results as PCRE2. Subsequent tests are
|
||||
checked for the use of \fBpcre2test\fP features that are incompatible with the
|
||||
\fBperltest.sh\fP script.
|
||||
.P
|
||||
Patterns must use '/' as their delimiter, and only certain modifiers are
|
||||
supported. Comment lines, #pattern commands, and #subject commands that set or
|
||||
unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
|
||||
#newline_default commands, which are needed in the relevant pcre2test files,
|
||||
are silently ignored. All other command lines are ignored, but give a warning
|
||||
message. The \fB#perltest\fP command helps detect tests that are accidentally
|
||||
put in the wrong file or use the wrong delimiter. For more details of the
|
||||
\fBperltest.sh\fP script see the comments it contains.
|
||||
.sp
|
||||
#pop [<modifiers>]
|
||||
#popcopy [<modifiers>]
|
||||
|
@ -432,15 +448,17 @@ excluding pattern meta-characters):
|
|||
.sp
|
||||
This is interpreted as the pattern's delimiter. A regular expression may be
|
||||
continued over several input lines, in which case the newline characters are
|
||||
included within it. It is possible to include the delimiter within the pattern
|
||||
by escaping it with a backslash, for example
|
||||
included within it. It is possible to include the delimiter as a literal within
|
||||
the pattern by escaping it with a backslash, for example
|
||||
.sp
|
||||
/abc\e/def/
|
||||
.sp
|
||||
If you do this, the escape and the delimiter form part of the pattern, but
|
||||
since the delimiters are all non-alphanumeric, this does not affect its
|
||||
interpretation. If the terminating delimiter is immediately followed by a
|
||||
backslash, for example,
|
||||
since the delimiters are all non-alphanumeric, the inclusion of the backslash
|
||||
does not affect the pattern's interpretation. Note, however, that this trick
|
||||
does not work within \eQ...\eE literal bracketing because the backslash will
|
||||
itself be interpreted as a literal. If the terminating delimiter is immediately
|
||||
followed by a backslash, for example,
|
||||
.sp
|
||||
/abc/\e
|
||||
.sp
|
||||
|
@ -459,11 +477,11 @@ A pattern can be followed by a modifier list (details below).
|
|||
.SH "SUBJECT LINE SYNTAX"
|
||||
.rs
|
||||
.sp
|
||||
Before each subject line is passed to \fBpcre2_match()\fP or
|
||||
\fBpcre2_dfa_match()\fP, leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes, unless the \fBsubject_literal\fP
|
||||
modifier was set for the pattern. The following provide a means of encoding
|
||||
non-printing characters in a visible way:
|
||||
Before each subject line is passed to \fBpcre2_match()\fP,
|
||||
\fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP, leading and trailing white
|
||||
space is removed, and the line is scanned for backslash escapes, unless the
|
||||
\fBsubject_literal\fP modifier was set for the pattern. The following provide a
|
||||
means of encoding non-printing characters in a visible way:
|
||||
.sp
|
||||
\ea alarm (BEL, \ex07)
|
||||
\eb backspace (\ex08)
|
||||
|
@ -559,6 +577,7 @@ way \fBpcre2_compile()\fP behaves. See
|
|||
for a description of the effects of these options.
|
||||
.sp
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
|
@ -638,7 +657,7 @@ heavily used in the test files.
|
|||
pushcopy push a copy onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
subject_literal treat all subject lines as literal
|
||||
tables=[0|1|2] select internal tables
|
||||
tables=[0|1|2|3] select internal tables
|
||||
use_length do not zero-terminate the pattern
|
||||
utf8_input treat input as UTF-8
|
||||
.sp
|
||||
|
@ -988,18 +1007,20 @@ be aborted.
|
|||
.rs
|
||||
.sp
|
||||
The value specified for the \fBtables\fP modifier must be one of the digits 0,
|
||||
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||
\fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour with
|
||||
different character tables. The digit specifies the tables as follows:
|
||||
1, 2, or 3. It causes a specific set of built-in character tables to be passed
|
||||
to \fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour
|
||||
with different character tables. The digit specifies the tables as follows:
|
||||
.sp
|
||||
0 do not pass any special character tables
|
||||
1 the default ASCII tables, as distributed in
|
||||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
3 a set of tables loaded by the #loadtables command
|
||||
.sp
|
||||
In table 2, some characters whose codes are greater than 128 are identified as
|
||||
letters, digits, spaces, etc. Setting alternate character tables and a locale
|
||||
are mutually exclusive.
|
||||
In tables 2, some characters whose codes are greater than 128 are identified as
|
||||
letters, digits, spaces, etc. Tables 3 can be used only after a
|
||||
\fB#loadtables\fP command has loaded them from a binary file. Setting alternate
|
||||
character tables and a locale are mutually exclusive.
|
||||
.
|
||||
.
|
||||
.SS "Setting certain match controls"
|
||||
|
@ -1011,24 +1032,27 @@ modifier list, in which case they are applied to every subject line that is
|
|||
processed with that pattern. These modifiers do not affect the compilation
|
||||
process.
|
||||
.sp
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution <n>
|
||||
substitute_stop=<n> skip substitution <n> and following
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
.sp
|
||||
These modifiers may not appear in a \fB#pattern\fP command. If you want them as
|
||||
defaults, set them in a \fB#subject\fP command.
|
||||
|
@ -1164,7 +1188,7 @@ its input), you must use \fBposix_startend\fP to specify its length.
|
|||
The following modifiers affect the matching process or request additional
|
||||
information. Some of them may also be specified on a pattern line (see above),
|
||||
in which case they apply to every subject line that is matched against that
|
||||
pattern.
|
||||
pattern, but can be overridden by modifiers on the subject.
|
||||
.sp
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
|
@ -1182,7 +1206,8 @@ pattern.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use \fBpcre2_dfa_match()\fP
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1192,6 +1217,8 @@ pattern.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1201,8 +1228,11 @@ pattern.
|
|||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
@ -1365,9 +1395,14 @@ by name.
|
|||
.rs
|
||||
.sp
|
||||
If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is
|
||||
called instead of one of the matching functions. Note that replacement strings
|
||||
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||
not thought to be an issue in a test program.
|
||||
called instead of one of the matching functions (or after one call of
|
||||
\fBpcre2_match()\fP in the case of PCRE2_SUBSTITUTE_MATCHED). Note that
|
||||
replacement strings cannot contain commas, because a comma signifies the end of
|
||||
a modifier. This is not thought to be an issue in a test program.
|
||||
.P
|
||||
Specifying a completely empty replacement string disables this modifier.
|
||||
However, it is possible to specify an empty replacement by providing a buffer
|
||||
length, as described below, for an otherwise empty replacement.
|
||||
.P
|
||||
Unlike subject strings, \fBpcre2test\fP does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to see if it
|
||||
|
@ -1381,10 +1416,18 @@ for \fBpcre2_substitute()\fP:
|
|||
.sp
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
.sp
|
||||
See the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
documentation for details of these options.
|
||||
.P
|
||||
After a successful substitution, the modified string is output, preceded by the
|
||||
number of replacements. This may be zero if there were no matches. Here is a
|
||||
|
@ -1486,7 +1529,7 @@ value that was set on the pattern.
|
|||
.sp
|
||||
The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
\fBfind_limits\fP modifier is specified.
|
||||
\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
|
||||
.
|
||||
.
|
||||
.SS "Finding minimum limits"
|
||||
|
@ -1496,8 +1539,12 @@ If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via \fBpcre2_set_heap_limit()\fP,
|
||||
\fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
.P
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
|
||||
|
@ -1521,9 +1568,7 @@ and non-recursive, to the internal matching function, thus controlling the
|
|||
overall amount of computing resource that is used.
|
||||
.P
|
||||
For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
.
|
||||
.
|
||||
.SS "Showing MARK names"
|
||||
|
@ -1542,12 +1587,10 @@ is added to the non-match message.
|
|||
.sp
|
||||
The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the \fBmemory\fP modifier never has any effect. For this modifier to work, the
|
||||
\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
\fBnull_context\fP modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
.
|
||||
|
@ -1599,7 +1642,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
.
|
||||
.
|
||||
.SS "Passing a NULL context"
|
||||
.SS "Passing a NULL context, subject, or replacement"
|
||||
.rs
|
||||
.sp
|
||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
||||
|
@ -1607,7 +1650,12 @@ Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
|||
If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
|
||||
\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
|
||||
modifiers.
|
||||
.P
|
||||
Similarly, for testing purposes, if the \fBnull_subject\fP or
|
||||
\fBnull_replacement\fP modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
.
|
||||
.
|
||||
.SH "THE ALTERNATIVE MATCHING FUNCTION"
|
||||
|
@ -2064,7 +2112,7 @@ on the stack.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -2073,6 +2121,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 July 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -24,17 +24,11 @@ SYNOPSIS
|
|||
tion options, control how the subject is processed, and what output is
|
||||
produced.
|
||||
|
||||
As the original fairly simple PCRE library evolved, it acquired many
|
||||
different features, and as a result, the original pcretest program
|
||||
ended up with a lot of options in a messy, arcane syntax for testing
|
||||
all the features. The move to the new PCRE2 API provided an opportunity
|
||||
to re-implement the test program as pcre2test, with a cleaner modifier
|
||||
syntax. Nevertheless, there are still many obscure modifiers, some of
|
||||
which are specifically designed for use in conjunction with the test
|
||||
script and data files that are distributed as part of PCRE2. All the
|
||||
modifiers are documented here, some without much justification, but
|
||||
many of them are unlikely to be of use except when testing the li-
|
||||
braries.
|
||||
There are many obscure modifiers, some of which are specifically de-
|
||||
signed for use in conjunction with the test script and data files that
|
||||
are distributed as part of PCRE2. All the modifiers are documented
|
||||
here, some without much justification, but many of them are unlikely to
|
||||
be of use except when testing the libraries.
|
||||
|
||||
|
||||
PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
|
||||
|
@ -50,7 +44,7 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
|
|||
output.
|
||||
|
||||
In the rest of this document, the names of library functions and struc-
|
||||
tures are given in generic form, for example, pcre_compile(). The ac-
|
||||
tures are given in generic form, for example, pcre2_compile(). The ac-
|
||||
tual names used in the libraries have a suffix _8, _16, or _32, as ap-
|
||||
propriate.
|
||||
|
||||
|
@ -58,10 +52,10 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
|
|||
INPUT ENCODING
|
||||
|
||||
Input to pcre2test is processed line by line, either by calling the C
|
||||
library's fgets() function, or via the libreadline library. In some
|
||||
Windows environments character 26 (hex 1A) causes an immediate end of
|
||||
file, and no further data is read, so this character should be avoided
|
||||
unless you really want that action.
|
||||
library's fgets() function, or via the libreadline or libedit library.
|
||||
In some Windows environments character 26 (hex 1A) causes an immediate
|
||||
end of file, and no further data is read, so this character should be
|
||||
avoided unless you really want that action.
|
||||
|
||||
The input is processed using C's string functions, so must not
|
||||
contain binary zeros, even though in Unix-like environments, fgets()
|
||||
|
@ -203,7 +197,17 @@ COMMAND LINE OPTIONS
|
|||
|
||||
-LM List modifiers: write a list of available pattern and subject
|
||||
modifiers to the standard output, then exit with zero exit
|
||||
code. All other options are ignored. If both -C and -LM are
|
||||
code. All other options are ignored. If both -C and any -Lx
|
||||
options are present, whichever is first is recognized.
|
||||
|
||||
-LP List properties: write a list of recognized Unicode proper-
|
||||
ties to the standard output, then exit with zero exit code.
|
||||
All other options are ignored. If both -C and any -Lx options
|
||||
are present, whichever is first is recognized.
|
||||
|
||||
-LS List scripts: write a list of recognized Unicode script names
|
||||
to the standard output, then exit with zero exit code. All
|
||||
other options are ignored. If both -C and any -Lx options are
|
||||
present, whichever is first is recognized.
|
||||
|
||||
-pattern modifier-list
|
||||
|
@ -257,121 +261,134 @@ DESCRIPTION
|
|||
appear. This file format, with some restrictions, can also be processed
|
||||
by the perltest.sh script that is distributed with PCRE2 as a means of
|
||||
checking that the behaviour of PCRE2 and Perl is the same. For a speci-
|
||||
fication of perltest.sh, see the comments near its beginning.
|
||||
fication of perltest.sh, see the comments near its beginning. See also
|
||||
the #perltest command below.
|
||||
|
||||
When the input is a terminal, pcre2test prompts for each line of input,
|
||||
using "re>" to prompt for regular expression patterns, and "data>" to
|
||||
prompt for subject lines. Command lines starting with # can be entered
|
||||
using "re>" to prompt for regular expression patterns, and "data>" to
|
||||
prompt for subject lines. Command lines starting with # can be entered
|
||||
only in response to the "re>" prompt.
|
||||
|
||||
Each subject line is matched separately and independently. If you want
|
||||
Each subject line is matched separately and independently. If you want
|
||||
to do multi-line matches, you have to use the \n escape sequence (or \r
|
||||
or \r\n, etc., depending on the newline setting) in a single line of
|
||||
input to encode the newline sequences. There is no limit on the length
|
||||
of subject lines; the input buffer is automatically extended if it is
|
||||
too small. There are replication features that makes it possible to
|
||||
generate long repetitive pattern or subject lines without having to
|
||||
or \r\n, etc., depending on the newline setting) in a single line of
|
||||
input to encode the newline sequences. There is no limit on the length
|
||||
of subject lines; the input buffer is automatically extended if it is
|
||||
too small. There are replication features that make it possible to
|
||||
generate long repetitive pattern or subject lines without having to
|
||||
supply them explicitly.
|
||||
|
||||
An empty line or the end of the file signals the end of the subject
|
||||
lines for a test, at which point a new pattern or command line is ex-
|
||||
An empty line or the end of the file signals the end of the subject
|
||||
lines for a test, at which point a new pattern or command line is ex-
|
||||
pected if there is still input to be read.
|
||||
|
||||
|
||||
COMMAND LINES
|
||||
|
||||
In between sets of test data, a line that begins with # is interpreted
|
||||
In between sets of test data, a line that begins with # is interpreted
|
||||
as a command line. If the first character is followed by white space or
|
||||
an exclamation mark, the line is treated as a comment, and ignored.
|
||||
an exclamation mark, the line is treated as a comment, and ignored.
|
||||
Otherwise, the following commands are recognized:
|
||||
|
||||
#forbid_utf
|
||||
|
||||
Subsequent patterns automatically have the PCRE2_NEVER_UTF and
|
||||
PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF
|
||||
and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of
|
||||
patterns. This command also forces an error if a subsequent pattern
|
||||
contains any occurrences of \P, \p, or \X, which are still supported
|
||||
when PCRE2_UTF is not set, but which require Unicode property support
|
||||
Subsequent patterns automatically have the PCRE2_NEVER_UTF and
|
||||
PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF
|
||||
and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of
|
||||
patterns. This command also forces an error if a subsequent pattern
|
||||
contains any occurrences of \P, \p, or \X, which are still supported
|
||||
when PCRE2_UTF is not set, but which require Unicode property support
|
||||
to be included in the library.
|
||||
|
||||
This is a trigger guard that is used in test files to ensure that UTF
|
||||
or Unicode property tests are not accidentally added to files that are
|
||||
used when Unicode support is not included in the library. Setting
|
||||
PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained
|
||||
by the use of #pattern; the difference is that #forbid_utf cannot be
|
||||
unset, and the automatic options are not displayed in pattern informa-
|
||||
This is a trigger guard that is used in test files to ensure that UTF
|
||||
or Unicode property tests are not accidentally added to files that are
|
||||
used when Unicode support is not included in the library. Setting
|
||||
PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained
|
||||
by the use of #pattern; the difference is that #forbid_utf cannot be
|
||||
unset, and the automatic options are not displayed in pattern informa-
|
||||
tion, to avoid cluttering up test output.
|
||||
|
||||
#load <filename>
|
||||
|
||||
This command is used to load a set of precompiled patterns from a file,
|
||||
as described in the section entitled "Saving and restoring compiled
|
||||
as described in the section entitled "Saving and restoring compiled
|
||||
patterns" below.
|
||||
|
||||
#loadtables <filename>
|
||||
|
||||
This command is used to load a set of binary character tables that can
|
||||
be accessed by the tables=3 qualifier. Such tables can be created by
|
||||
the pcre2_dftables program with the -b option.
|
||||
|
||||
#newline_default [<newline-list>]
|
||||
|
||||
When PCRE2 is built, a default newline convention can be specified.
|
||||
This determines which characters and/or character pairs are recognized
|
||||
When PCRE2 is built, a default newline convention can be specified.
|
||||
This determines which characters and/or character pairs are recognized
|
||||
as indicating a newline in a pattern or subject string. The default can
|
||||
be overridden when a pattern is compiled. The standard test files con-
|
||||
tain tests of various newline conventions, but the majority of the
|
||||
tests expect a single linefeed to be recognized as a newline by de-
|
||||
fault. Without special action the tests would fail when PCRE2 is com-
|
||||
be overridden when a pattern is compiled. The standard test files con-
|
||||
tain tests of various newline conventions, but the majority of the
|
||||
tests expect a single linefeed to be recognized as a newline by de-
|
||||
fault. Without special action the tests would fail when PCRE2 is com-
|
||||
piled with either CR or CRLF as the default newline.
|
||||
|
||||
The #newline_default command specifies a list of newline types that are
|
||||
acceptable as the default. The types must be one of CR, LF, CRLF, ANY-
|
||||
acceptable as the default. The types must be one of CR, LF, CRLF, ANY-
|
||||
CRLF, ANY, or NUL (in upper or lower case), for example:
|
||||
|
||||
#newline_default LF Any anyCRLF
|
||||
|
||||
If the default newline is in the list, this command has no effect. Oth-
|
||||
erwise, except when testing the POSIX API, a newline modifier that
|
||||
erwise, except when testing the POSIX API, a newline modifier that
|
||||
specifies the first newline convention in the list (LF in the above ex-
|
||||
ample) is added to any pattern that does not already have a newline
|
||||
ample) is added to any pattern that does not already have a newline
|
||||
modifier. If the newline list is empty, the feature is turned off. This
|
||||
command is present in a number of the standard test input files.
|
||||
|
||||
When the POSIX API is being tested there is no way to override the de-
|
||||
When the POSIX API is being tested there is no way to override the de-
|
||||
fault newline convention, though it is possible to set the newline con-
|
||||
vention from within the pattern. A warning is given if the posix or
|
||||
posix_nosub modifier is used when #newline_default would set a default
|
||||
vention from within the pattern. A warning is given if the posix or
|
||||
posix_nosub modifier is used when #newline_default would set a default
|
||||
for the non-POSIX API.
|
||||
|
||||
#pattern <modifier-list>
|
||||
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
quent patterns. Modifiers on a pattern can change these settings.
|
||||
|
||||
#perltest
|
||||
|
||||
The appearance of this line causes all subsequent modifier settings to
|
||||
be checked for compatibility with the perltest.sh script, which is used
|
||||
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
||||
comment lines, #pattern commands, and #subject commands that set or un-
|
||||
set "mark", no command lines are permitted, because they and many of
|
||||
the modifiers are specific to pcre2test, and should not be used in test
|
||||
files that are also processed by perltest.sh. The #perltest command
|
||||
helps detect tests that are accidentally put in the wrong file.
|
||||
This line is used in test files that can also be processed by perl-
|
||||
test.sh to confirm that Perl gives the same results as PCRE2. Subse-
|
||||
quent tests are checked for the use of pcre2test features that are in-
|
||||
compatible with the perltest.sh script.
|
||||
|
||||
Patterns must use '/' as their delimiter, and only certain modifiers
|
||||
are supported. Comment lines, #pattern commands, and #subject commands
|
||||
that set or unset "mark" are recognized and acted on. The #perltest,
|
||||
#forbid_utf, and #newline_default commands, which are needed in the
|
||||
relevant pcre2test files, are silently ignored. All other command lines
|
||||
are ignored, but give a warning message. The #perltest command helps
|
||||
detect tests that are accidentally put in the wrong file or use the
|
||||
wrong delimiter. For more details of the perltest.sh script see the
|
||||
comments it contains.
|
||||
|
||||
#pop [<modifiers>]
|
||||
#popcopy [<modifiers>]
|
||||
|
||||
These commands are used to manipulate the stack of compiled patterns,
|
||||
as described in the section entitled "Saving and restoring compiled
|
||||
These commands are used to manipulate the stack of compiled patterns,
|
||||
as described in the section entitled "Saving and restoring compiled
|
||||
patterns" below.
|
||||
|
||||
#save <filename>
|
||||
|
||||
This command is used to save a set of compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
This command is used to save a set of compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below.
|
||||
|
||||
#subject <modifier-list>
|
||||
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
quent subject lines. Modifiers on a subject line can change these set-
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
quent subject lines. Modifiers on a subject line can change these set-
|
||||
tings.
|
||||
|
||||
|
||||
|
@ -379,47 +396,50 @@ MODIFIER SYNTAX
|
|||
|
||||
Modifier lists are used with both pattern and subject lines. Items in a
|
||||
list are separated by commas followed by optional white space. Trailing
|
||||
whitespace in a modifier list is ignored. Some modifiers may be given
|
||||
for both patterns and subject lines, whereas others are valid only for
|
||||
one or the other. Each modifier has a long name, for example "an-
|
||||
chored", and some of them must be followed by an equals sign and a
|
||||
value, for example, "offset=12". Values cannot contain comma charac-
|
||||
ters, but may contain spaces. Modifiers that do not take values may be
|
||||
whitespace in a modifier list is ignored. Some modifiers may be given
|
||||
for both patterns and subject lines, whereas others are valid only for
|
||||
one or the other. Each modifier has a long name, for example "an-
|
||||
chored", and some of them must be followed by an equals sign and a
|
||||
value, for example, "offset=12". Values cannot contain comma charac-
|
||||
ters, but may contain spaces. Modifiers that do not take values may be
|
||||
preceded by a minus sign to turn off a previous setting.
|
||||
|
||||
A few of the more common modifiers can also be specified as single let-
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
Perl convention, these are written with a slash ("the /i modifier") for
|
||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||
item of a modifier list. If the first item is not recognized as a long
|
||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||
item of a modifier list. If the first item is not recognized as a long
|
||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||
For example:
|
||||
|
||||
/abc/ig,newline=cr,jit=3
|
||||
|
||||
This is a pattern line whose modifier list starts with two one-letter
|
||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||
This is a pattern line whose modifier list starts with two one-letter
|
||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||
same as used in Perl.
|
||||
|
||||
|
||||
PATTERN SYNTAX
|
||||
|
||||
A pattern line must start with one of the following characters (common
|
||||
A pattern line must start with one of the following characters (common
|
||||
symbols, excluding pattern meta-characters):
|
||||
|
||||
/ ! " ' ` - = _ : ; , % & @ ~
|
||||
|
||||
This is interpreted as the pattern's delimiter. A regular expression
|
||||
may be continued over several input lines, in which case the newline
|
||||
This is interpreted as the pattern's delimiter. A regular expression
|
||||
may be continued over several input lines, in which case the newline
|
||||
characters are included within it. It is possible to include the delim-
|
||||
iter within the pattern by escaping it with a backslash, for example
|
||||
iter as a literal within the pattern by escaping it with a backslash,
|
||||
for example
|
||||
|
||||
/abc\/def/
|
||||
|
||||
If you do this, the escape and the delimiter form part of the pattern,
|
||||
but since the delimiters are all non-alphanumeric, this does not affect
|
||||
its interpretation. If the terminating delimiter is immediately fol-
|
||||
lowed by a backslash, for example,
|
||||
but since the delimiters are all non-alphanumeric, the inclusion of the
|
||||
backslash does not affect the pattern's interpretation. Note, however,
|
||||
that this trick does not work within \Q...\E literal bracketing because
|
||||
the backslash will itself be interpreted as a literal. If the terminat-
|
||||
ing delimiter is immediately followed by a backslash, for example,
|
||||
|
||||
/abc/\
|
||||
|
||||
|
@ -438,11 +458,11 @@ PATTERN SYNTAX
|
|||
|
||||
SUBJECT LINE SYNTAX
|
||||
|
||||
Before each subject line is passed to pcre2_match() or
|
||||
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes, unless the subject_literal modi-
|
||||
fier was set for the pattern. The following provide a means of encoding
|
||||
non-printing characters in a visible way:
|
||||
Before each subject line is passed to pcre2_match(), pcre2_dfa_match(),
|
||||
or pcre2_jit_match(), leading and trailing white space is removed, and
|
||||
the line is scanned for backslash escapes, unless the subject_literal
|
||||
modifier was set for the pattern. The following provide a means of en-
|
||||
coding non-printing characters in a visible way:
|
||||
|
||||
\a alarm (BEL, \x07)
|
||||
\b backspace (\x08)
|
||||
|
@ -537,6 +557,7 @@ PATTERN MODIFIERS
|
|||
options.
|
||||
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
|
@ -613,7 +634,7 @@ PATTERN MODIFIERS
|
|||
pushcopy push a copy onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
subject_literal treat all subject lines as literal
|
||||
tables=[0|1|2] select internal tables
|
||||
tables=[0|1|2|3] select internal tables
|
||||
use_length do not zero-terminate the pattern
|
||||
utf8_input treat input as UTF-8
|
||||
|
||||
|
@ -914,80 +935,85 @@ PATTERN MODIFIERS
|
|||
Using alternative character tables
|
||||
|
||||
The value specified for the tables modifier must be one of the digits
|
||||
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||
haviour with different character tables. The digit specifies the tables
|
||||
as follows:
|
||||
0, 1, 2, or 3. It causes a specific set of built-in character tables to
|
||||
be passed to pcre2_compile(). This is used in the PCRE2 tests to check
|
||||
behaviour with different character tables. The digit specifies the ta-
|
||||
bles as follows:
|
||||
|
||||
0 do not pass any special character tables
|
||||
1 the default ASCII tables, as distributed in
|
||||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
3 a set of tables loaded by the #loadtables command
|
||||
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character ta-
|
||||
bles and a locale are mutually exclusive.
|
||||
In tables 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Tables 3 can be used only after
|
||||
a #loadtables command has loaded them from a binary file. Setting al-
|
||||
ternate character tables and a locale are mutually exclusive.
|
||||
|
||||
Setting certain match controls
|
||||
|
||||
The following modifiers are really subject modifiers, and are described
|
||||
under "Subject Modifiers" below. However, they may be included in a
|
||||
pattern's modifier list, in which case they are applied to every sub-
|
||||
ject line that is processed with that pattern. These modifiers do not
|
||||
under "Subject Modifiers" below. However, they may be included in a
|
||||
pattern's modifier list, in which case they are applied to every sub-
|
||||
ject line that is processed with that pattern. These modifiers do not
|
||||
affect the compilation process.
|
||||
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution <n>
|
||||
substitute_stop=<n> skip substitution <n> and following
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
as defaults, set them in a #subject command.
|
||||
|
||||
Specifying literal subject lines
|
||||
|
||||
If the subject_literal modifier is present on a pattern, all the sub-
|
||||
If the subject_literal modifier is present on a pattern, all the sub-
|
||||
ject lines that it matches are taken as literal strings, with no inter-
|
||||
pretation of backslashes. It is not possible to set subject modifiers
|
||||
on such lines, but any that are set as defaults by a #subject command
|
||||
pretation of backslashes. It is not possible to set subject modifiers
|
||||
on such lines, but any that are set as defaults by a #subject command
|
||||
are recognized.
|
||||
|
||||
Saving a compiled pattern
|
||||
|
||||
When a pattern with the push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or a command) instead of a subject
|
||||
When a pattern with the push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or a command) instead of a subject
|
||||
line. This facility is used when saving compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below. If pushcopy is used instead of push, a copy of the com-
|
||||
piled pattern is stacked, leaving the original as current, ready to
|
||||
match the following input lines. This provides a way of testing the
|
||||
pcre2_code_copy() function. The push and pushcopy modifiers are in-
|
||||
compatible with compilation modifiers such as global that act at match
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below. If pushcopy is used instead of push, a copy of the com-
|
||||
piled pattern is stacked, leaving the original as current, ready to
|
||||
match the following input lines. This provides a way of testing the
|
||||
pcre2_code_copy() function. The push and pushcopy modifiers are in-
|
||||
compatible with compilation modifiers such as global that act at match
|
||||
time. Any that are specified are ignored (for the stacked copy), with a
|
||||
warning message, except for replace, which causes an error. Note that
|
||||
jitverify, which is allowed, does not carry through to any subsequent
|
||||
warning message, except for replace, which causes an error. Note that
|
||||
jitverify, which is allowed, does not carry through to any subsequent
|
||||
matching that uses a stacked pattern.
|
||||
|
||||
Testing foreign pattern conversion
|
||||
|
||||
The experimental foreign pattern conversion functions in PCRE2 can be
|
||||
tested by setting the convert modifier. Its argument is a colon-sepa-
|
||||
rated list of options, which set the equivalent option for the
|
||||
The experimental foreign pattern conversion functions in PCRE2 can be
|
||||
tested by setting the convert modifier. Its argument is a colon-sepa-
|
||||
rated list of options, which set the equivalent option for the
|
||||
pcre2_pattern_convert() function:
|
||||
|
||||
glob PCRE2_CONVERT_GLOB
|
||||
|
@ -999,19 +1025,19 @@ PATTERN MODIFIERS
|
|||
|
||||
The "unset" value is useful for turning off a default that has been set
|
||||
by a #pattern command. When one of these options is set, the input pat-
|
||||
tern is passed to pcre2_pattern_convert(). If the conversion is suc-
|
||||
cessful, the result is reflected in the output and then passed to
|
||||
tern is passed to pcre2_pattern_convert(). If the conversion is suc-
|
||||
cessful, the result is reflected in the output and then passed to
|
||||
pcre2_compile(). The normal utf and no_utf_check options, if set, cause
|
||||
the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be
|
||||
the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be
|
||||
passed to pcre2_pattern_convert().
|
||||
|
||||
By default, the conversion function is allowed to allocate a buffer for
|
||||
its output. However, if the convert_length modifier is set to a value
|
||||
greater than zero, pcre2test passes a buffer of the given length. This
|
||||
its output. However, if the convert_length modifier is set to a value
|
||||
greater than zero, pcre2test passes a buffer of the given length. This
|
||||
makes it possible to test the length check.
|
||||
|
||||
The convert_glob_escape and convert_glob_separator modifiers can be
|
||||
used to specify the escape and separator characters for glob process-
|
||||
The convert_glob_escape and convert_glob_separator modifiers can be
|
||||
used to specify the escape and separator characters for glob process-
|
||||
ing, overriding the defaults, which are operating-system dependent.
|
||||
|
||||
|
||||
|
@ -1022,7 +1048,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
Setting match options
|
||||
|
||||
The following modifiers set options for pcre2_match() or
|
||||
The following modifiers set options for pcre2_match() or
|
||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||
|
||||
anchored set PCRE2_ANCHORED
|
||||
|
@ -1038,35 +1064,36 @@ SUBJECT MODIFIERS
|
|||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
they appear frequently in tests.
|
||||
|
||||
If the posix or posix_nosub modifier was present on the pattern, caus-
|
||||
If the posix or posix_nosub modifier was present on the pattern, caus-
|
||||
ing the POSIX wrapper API to be used, the only option-setting modifiers
|
||||
that have any effect are notbol, notempty, and noteol, causing REG_NOT-
|
||||
BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to
|
||||
BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to
|
||||
regexec(). The other modifiers are ignored, with a warning message.
|
||||
|
||||
There is one additional modifier that can be used with the POSIX wrap-
|
||||
There is one additional modifier that can be used with the POSIX wrap-
|
||||
per. It is ignored (with a warning) if used for non-POSIX matching.
|
||||
|
||||
posix_startend=<n>[:<m>]
|
||||
|
||||
This causes the subject string to be passed to regexec() using the
|
||||
REG_STARTEND option, which uses offsets to specify which part of the
|
||||
string is searched. If only one number is given, the end offset is
|
||||
passed as the end of the subject string. For more detail of REG_STAR-
|
||||
TEND, see the pcre2posix documentation. If the subject string contains
|
||||
binary zeros (coded as escapes such as \x{00} because pcre2test does
|
||||
This causes the subject string to be passed to regexec() using the
|
||||
REG_STARTEND option, which uses offsets to specify which part of the
|
||||
string is searched. If only one number is given, the end offset is
|
||||
passed as the end of the subject string. For more detail of REG_STAR-
|
||||
TEND, see the pcre2posix documentation. If the subject string contains
|
||||
binary zeros (coded as escapes such as \x{00} because pcre2test does
|
||||
not support actual binary zeros in its input), you must use posix_star-
|
||||
tend to specify its length.
|
||||
|
||||
Setting match controls
|
||||
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern.
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern, but can be overridden by modifiers on
|
||||
the subject.
|
||||
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
|
@ -1084,7 +1111,8 @@ SUBJECT MODIFIERS
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1094,6 +1122,8 @@ SUBJECT MODIFIERS
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1103,8 +1133,11 @@ SUBJECT MODIFIERS
|
|||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
@ -1249,29 +1282,40 @@ SUBJECT MODIFIERS
|
|||
Testing the substitution function
|
||||
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Note that replacement
|
||||
strings cannot contain commas, because a comma signifies the end of a
|
||||
modifier. This is not thought to be an issue in a test program.
|
||||
called instead of one of the matching functions (or after one call of
|
||||
pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that re-
|
||||
placement strings cannot contain commas, because a comma signifies the
|
||||
end of a modifier. This is not thought to be an issue in a test pro-
|
||||
gram.
|
||||
|
||||
Unlike subject strings, pcre2test does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to
|
||||
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||
a UTF string of the appropriate code unit width. If it is not a valid
|
||||
UTF-8 string, the individual code units are copied directly. This pro-
|
||||
Specifying a completely empty replacement string disables this modi-
|
||||
fier. However, it is possible to specify an empty replacement by pro-
|
||||
viding a buffer length, as described below, for an otherwise empty re-
|
||||
placement.
|
||||
|
||||
Unlike subject strings, pcre2test does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to
|
||||
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||
a UTF string of the appropriate code unit width. If it is not a valid
|
||||
UTF-8 string, the individual code units are copied directly. This pro-
|
||||
vides a means of passing an invalid UTF-8 string for testing purposes.
|
||||
|
||||
The following modifiers set options (in addition to the normal match
|
||||
The following modifiers set options (in addition to the normal match
|
||||
options) for pcre2_substitute():
|
||||
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
See the pcre2api documentation for details of these options.
|
||||
|
||||
After a successful substitution, the modified string is output, pre-
|
||||
ceded by the number of replacements. This may be zero if there were no
|
||||
After a successful substitution, the modified string is output, pre-
|
||||
ceded by the number of replacements. This may be zero if there were no
|
||||
matches. Here is a simple example of a substitution test:
|
||||
|
||||
/abc/replace=xxx
|
||||
|
@ -1280,12 +1324,12 @@ SUBJECT MODIFIERS
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
|
||||
Subject and replacement strings should be kept relatively short (fewer
|
||||
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||
used. To make it easy to test for buffer overflow, if the replacement
|
||||
string starts with a number in square brackets, that number is passed
|
||||
to pcre2_substitute() as the size of the output buffer, with the re-
|
||||
placement string starting at the next character. Here is an example
|
||||
Subject and replacement strings should be kept relatively short (fewer
|
||||
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||
used. To make it easy to test for buffer overflow, if the replacement
|
||||
string starts with a number in square brackets, that number is passed
|
||||
to pcre2_substitute() as the size of the output buffer, with the re-
|
||||
placement string starting at the next character. Here is an example
|
||||
that tests the edge case:
|
||||
|
||||
/abc/
|
||||
|
@ -1295,12 +1339,12 @@ SUBJECT MODIFIERS
|
|||
Failed: error -47: no more memory
|
||||
|
||||
The default action of pcre2_substitute() is to return PCRE2_ER-
|
||||
ROR_NOMEMORY when the output buffer is too small. However, if the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi-
|
||||
ROR_NOMEMORY when the output buffer is too small. However, if the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi-
|
||||
tute_overflow_length modifier), pcre2_substitute() continues to go
|
||||
through the motions of matching and substituting (but not doing any
|
||||
callouts), in order to compute the size of buffer that is required.
|
||||
When this happens, pcre2test shows the required buffer length (which
|
||||
through the motions of matching and substituting (but not doing any
|
||||
callouts), in order to compute the size of buffer that is required.
|
||||
When this happens, pcre2test shows the required buffer length (which
|
||||
includes space for the trailing zero) as part of the error message. For
|
||||
example:
|
||||
|
||||
|
@ -1309,15 +1353,15 @@ SUBJECT MODIFIERS
|
|||
Failed: error -47: no more memory: 10 code units are needed
|
||||
|
||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
pcre2_substitute().
|
||||
|
||||
Testing substitute callouts
|
||||
|
||||
If the substitute_callout modifier is set, a substitution callout func-
|
||||
tion is set up. The null_context modifier must not be set, because the
|
||||
address of the callout function is passed in a match context. When the
|
||||
callout function is called (after each substitution), details of the
|
||||
tion is set up. The null_context modifier must not be set, because the
|
||||
address of the callout function is passed in a match context. When the
|
||||
callout function is called (after each substitution), details of the
|
||||
input and output strings are output. For example:
|
||||
|
||||
/abc/g,replace=<$0>,substitute_callout
|
||||
|
@ -1326,19 +1370,19 @@ SUBJECT MODIFIERS
|
|||
2(1) Old 6 9 "abc" New 8 13 "<abc>"
|
||||
2: <abc>def<abc>pqr
|
||||
|
||||
The first number on each callout line is the count of matches. The
|
||||
The first number on each callout line is the count of matches. The
|
||||
parenthesized number is the number of pairs that are set in the ovector
|
||||
(that is, one more than the number of capturing groups that were set).
|
||||
(that is, one more than the number of capturing groups that were set).
|
||||
Then are listed the offsets of the old substring, its contents, and the
|
||||
same for the replacement.
|
||||
|
||||
By default, the substitution callout function returns zero, which ac-
|
||||
cepts the replacement and causes matching to continue if /g was used.
|
||||
Two further modifiers can be used to test other return values. If sub-
|
||||
stitute_skip is set to a value greater than zero the callout function
|
||||
returns +1 for the match of that number, and similarly substitute_stop
|
||||
returns -1. These cause the replacement to be rejected, and -1 causes
|
||||
no further matching to take place. If either of them is set, substi-
|
||||
By default, the substitution callout function returns zero, which ac-
|
||||
cepts the replacement and causes matching to continue if /g was used.
|
||||
Two further modifiers can be used to test other return values. If sub-
|
||||
stitute_skip is set to a value greater than zero the callout function
|
||||
returns +1 for the match of that number, and similarly substitute_stop
|
||||
returns -1. These cause the replacement to be rejected, and -1 causes
|
||||
no further matching to take place. If either of them is set, substi-
|
||||
tute_callout is assumed. For example:
|
||||
|
||||
/abc/g,replace=<$0>,substitute_skip=1
|
||||
|
@ -1356,126 +1400,131 @@ SUBJECT MODIFIERS
|
|||
|
||||
Setting the JIT stack size
|
||||
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
JIT optimization is not being used. The value is a number of kibibytes
|
||||
(units of 1024 bytes). Setting zero reverts to the default of 32KiB.
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
JIT optimization is not being used. The value is a number of kibibytes
|
||||
(units of 1024 bytes). Setting zero reverts to the default of 32KiB.
|
||||
Providing a stack that is larger than the default is necessary only for
|
||||
very complicated patterns. If jitstack is set non-zero on a subject
|
||||
very complicated patterns. If jitstack is set non-zero on a subject
|
||||
line it overrides any value that was set on the pattern.
|
||||
|
||||
Setting heap, match, and depth limits
|
||||
|
||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||
priate limits in the match context. These values are ignored when the
|
||||
find_limits modifier is specified.
|
||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||
priate limits in the match context. These values are ignored when the
|
||||
find_limits or find_limits_noheap modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
If the find_limits modifier is present on a subject line, pcre2test
|
||||
calls the relevant matching function several times, setting different
|
||||
values in the match context via pcre2_set_heap_limit(),
|
||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||
minimum values for each parameter that allows the match to complete
|
||||
without error. If JIT is being used, only the match limit is relevant.
|
||||
If the find_limits modifier is present on a subject line, pcre2test
|
||||
calls the relevant matching function several times, setting different
|
||||
values in the match context via pcre2_set_heap_limit(),
|
||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||
smallest value for each parameter that allows the match to complete
|
||||
without a "limit exceeded" error. The match itself may succeed or fail.
|
||||
An alternative modifier, find_limits_noheap, omits the heap limit. This
|
||||
is used in the standard tests, because the minimum heap limit varies
|
||||
between systems. If JIT is being used, only the match limit is rele-
|
||||
vant, and the other two are automatically omitted.
|
||||
|
||||
When using this modifier, the pattern should not contain any limit set-
|
||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||
present and is lower than the minimum matching value, the minimum value
|
||||
cannot be found because pcre2_set_match_limit() etc. are only able to
|
||||
cannot be found because pcre2_set_match_limit() etc. are only able to
|
||||
reduce the value of an in-pattern limit; they cannot increase it.
|
||||
|
||||
For non-DFA matching, the minimum depth_limit number is a measure of
|
||||
For non-DFA matching, the minimum depth_limit number is a measure of
|
||||
how much nested backtracking happens (that is, how deeply the pattern's
|
||||
tree is searched). In the case of DFA matching, depth_limit controls
|
||||
the depth of recursive calls of the internal function that is used for
|
||||
tree is searched). In the case of DFA matching, depth_limit controls
|
||||
the depth of recursive calls of the internal function that is used for
|
||||
handling pattern recursion, lookaround assertions, and atomic groups.
|
||||
|
||||
For non-DFA matching, the match_limit number is a measure of the amount
|
||||
of backtracking that takes place, and learning the minimum value can be
|
||||
instructive. For most simple matches, the number is quite small, but
|
||||
for patterns with very large numbers of matching possibilities, it can
|
||||
become large very quickly with increasing length of subject string. In
|
||||
the case of DFA matching, match_limit controls the total number of
|
||||
instructive. For most simple matches, the number is quite small, but
|
||||
for patterns with very large numbers of matching possibilities, it can
|
||||
become large very quickly with increasing length of subject string. In
|
||||
the case of DFA matching, match_limit controls the total number of
|
||||
calls, both recursive and non-recursive, to the internal matching func-
|
||||
tion, thus controlling the overall amount of computing resource that is
|
||||
used.
|
||||
|
||||
For both kinds of matching, the heap_limit number, which is in
|
||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||
for matching. A value of zero disables the use of any heap memory; many
|
||||
simple pattern matches can be done without using the heap, so zero is
|
||||
not an unreasonable setting.
|
||||
For both kinds of matching, the heap_limit number, which is in
|
||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||
for matching.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
||||
The mark modifier causes the names from backtracking control verbs that
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
it is added to the non-match message.
|
||||
|
||||
Showing memory usage
|
||||
|
||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||
ory allocation and freeing calls that occur during a call to
|
||||
pcre2_match() or pcre2_dfa_match(). These occur only when a match re-
|
||||
quires a bigger vector than the default for remembering backtracking
|
||||
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()).
|
||||
In many cases there will be no heap memory used and therefore no addi-
|
||||
tional output. No heap memory is allocated during matching with JIT, so
|
||||
in that case the memory modifier never has any effect. For this modi-
|
||||
fier to work, the null_context modifier must not be set on both the
|
||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||
ory allocation and freeing calls that occur during a call to
|
||||
pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
|
||||
used only when a match requires more internal workspace than the de-
|
||||
fault allocation on the stack, so in many cases there will be no out-
|
||||
put. No heap memory is allocated during matching with JIT. For this
|
||||
modifier to work, the null_context modifier must not be set on both the
|
||||
pattern and the subject, though it can be set on one or the other.
|
||||
|
||||
Setting a starting offset
|
||||
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
matching starts. Its value is a number of code units, not characters.
|
||||
|
||||
Setting an offset limit
|
||||
|
||||
The offset_limit modifier sets a limit for unanchored matches. If a
|
||||
The offset_limit modifier sets a limit for unanchored matches. If a
|
||||
match cannot be found starting at or before this offset in the subject,
|
||||
a "no match" return is given. The data value is a number of code units,
|
||||
not characters. When this modifier is used, the use_offset_limit modi-
|
||||
not characters. When this modifier is used, the use_offset_limit modi-
|
||||
fier must have been set for the pattern; if not, an error is generated.
|
||||
|
||||
Setting the size of the output vector
|
||||
|
||||
The ovector modifier applies only to the subject line in which it ap-
|
||||
The ovector modifier applies only to the subject line in which it ap-
|
||||
pears, though of course it can also be used to set a default in a #sub-
|
||||
ject command. It specifies the number of pairs of offsets that are
|
||||
ject command. It specifies the number of pairs of offsets that are
|
||||
available for storing matching information. The default is 15.
|
||||
|
||||
A value of zero is useful when testing the POSIX API because it causes
|
||||
A value of zero is useful when testing the POSIX API because it causes
|
||||
regexec() to be called with a NULL capture vector. When not testing the
|
||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||
ate_from_pattern() to be called, in order to create a match block of
|
||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||
ate_from_pattern() to be called, in order to create a match block of
|
||||
exactly the right size for the pattern. (It is not possible to create a
|
||||
match block with a zero-length ovector; there is always at least one
|
||||
match block with a zero-length ovector; there is always at least one
|
||||
pair of offsets.)
|
||||
|
||||
Passing the subject as zero-terminated
|
||||
|
||||
By default, the subject string is passed to a native API matching func-
|
||||
tion with its correct length. In order to test the facility for passing
|
||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||
causes the length to be passed as PCRE2_ZERO_TERMINATED. When matching
|
||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||
causes the length to be passed as PCRE2_ZERO_TERMINATED. When matching
|
||||
via the POSIX interface, this modifier is ignored, with a warning.
|
||||
|
||||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_match(),
|
||||
pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the
|
||||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly
|
||||
in this case (they use default values). This modifier cannot be used
|
||||
with the find_limits or substitute_callout modifiers.
|
||||
Normally, pcre2test passes a context block to pcre2_match(),
|
||||
pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the
|
||||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly
|
||||
in this case (they use default values). This modifier cannot be used
|
||||
with the find_limits, find_limits_noheap, or substitute_callout modi-
|
||||
fiers.
|
||||
|
||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||
ment modifier is set, the subject or replacement string pointers are
|
||||
passed as NULL, respectively, to the relevant functions.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
@ -1896,11 +1945,11 @@ SEE ALSO
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 July 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "24 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -7,7 +7,7 @@ PCRE - Perl-compatible regular expressions (revised API)
|
|||
PCRE2 is normally built with Unicode support, though if you do not need it, you
|
||||
can build it without, in which case the library will be smaller. With Unicode
|
||||
support, PCRE2 has knowledge of Unicode character properties and can process
|
||||
text strings in UTF-8, UTF-16, or UTF-32 format (depending on the code unit
|
||||
strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit
|
||||
width), but this is not the default. Unless specifically requested, PCRE2
|
||||
treats each code unit in a string as one character.
|
||||
.P
|
||||
|
@ -40,10 +40,11 @@ handled, as documented below.
|
|||
.sp
|
||||
When PCRE2 is built with Unicode support, the escape sequences \ep{..},
|
||||
\eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -51,10 +52,10 @@ and
|
|||
.\" HREF
|
||||
\fBpcre2syntax\fP
|
||||
.\"
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
.
|
||||
.
|
||||
.SH "WIDE CHARACTERS AND UTF MODES"
|
||||
|
@ -126,14 +127,16 @@ However, the special horizontal and vertical white space matching escapes (\eh,
|
|||
not PCRE2_UCP is set.
|
||||
.
|
||||
.
|
||||
.SH "CASE-EQUIVALENCE IN UTF MODE"
|
||||
.SH "UNICODE CASE-EQUIVALENCE"
|
||||
.rs
|
||||
.sp
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties except
|
||||
for characters whose code points are less than 128 and that have at most two
|
||||
case-equivalent values. For these, a direct table lookup is used for speed. A
|
||||
few Unicode characters such as Greek sigma have more than two code points that
|
||||
are case-equivalent, and these are treated specially.
|
||||
If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use
|
||||
of Unicode properties except for characters whose code points are less than 128
|
||||
and that have at most two case-equivalent values. For these, a direct table
|
||||
lookup is used for speed. A few Unicode characters such as Greek sigma have
|
||||
more than two code points that are case-equivalent, and these are treated
|
||||
specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case
|
||||
processing for non-UTF character encodings such as UCS-2.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="scriptruns"></a>
|
||||
|
@ -446,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -455,6 +458,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 24 May 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 22 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
# PCRE2 - Perl-Compatible Regular Expressions
|
||||
|
||||
The PCRE2 library is a set of C functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
|
||||
own native API, as well as a set of wrapper functions that correspond to the
|
||||
POSIX regular expression API. The PCRE2 library is free, even for building
|
||||
proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
|
||||
or 32-bit code units, in either literal or UTF encoding.
|
||||
|
||||
PCRE2 was first released in 2015 to replace the API in the original PCRE
|
||||
library, which is now obsolete and no longer maintained. As well as a more
|
||||
flexible API, the code of PCRE2 has been much improved since the fork.
|
||||
|
||||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
If you just need the command-line PCRE2 tools on Windows, precompiled binary
|
||||
versions are available at this
|
||||
[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
|
||||
|
||||
A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
|
||||
default character encoding, can be found at
|
||||
[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
|
||||
|
||||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
||||
There is a curated summary of changes for each PCRE release, copies of
|
||||
documentation from older releases, and other useful information from the third
|
||||
party authored
|
||||
[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
|
||||
|
||||
## Contact
|
||||
|
||||
To report a problem with the PCRE2 library, or to make a feature request, please
|
||||
use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
|
||||
PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
|
||||
announcements will be made. You can browse the
|
||||
[list archives](https://groups.google.com/g/pcre2-dev).
|
||||
|
|
@ -8,6 +8,6 @@ includedir=@includedir@
|
|||
Name: libpcre2-16
|
||||
Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 16 bit character support
|
||||
Version: @PACKAGE_VERSION@
|
||||
Libs: -L${libdir} -lpcre2-16
|
||||
Libs: -L${libdir} -lpcre2-16@LIB_POSTFIX@
|
||||
Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@
|
||||
Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@
|
||||
|
|
|
@ -8,6 +8,6 @@ includedir=@includedir@
|
|||
Name: libpcre2-32
|
||||
Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 32 bit character support
|
||||
Version: @PACKAGE_VERSION@
|
||||
Libs: -L${libdir} -lpcre2-32
|
||||
Libs: -L${libdir} -lpcre2-32@LIB_POSTFIX@
|
||||
Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@
|
||||
Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@
|
||||
|
|
|
@ -8,6 +8,6 @@ includedir=@includedir@
|
|||
Name: libpcre2-8
|
||||
Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 8 bit character support
|
||||
Version: @PACKAGE_VERSION@
|
||||
Libs: -L${libdir} -lpcre2-8
|
||||
Libs: -L${libdir} -lpcre2-8@LIB_POSTFIX@
|
||||
Libs.private: @PTHREAD_CFLAGS@ @PTHREAD_LIBS@
|
||||
Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@
|
||||
|
|
|
@ -8,6 +8,6 @@ includedir=@includedir@
|
|||
Name: libpcre2-posix
|
||||
Description: Posix compatible interface to libpcre2-8
|
||||
Version: @PACKAGE_VERSION@
|
||||
Libs: -L${libdir} -lpcre2-posix
|
||||
Libs: -L${libdir} -lpcre2-posix@LIB_POSTFIX@
|
||||
Cflags: -I${includedir} @PCRE2_STATIC_CFLAG@
|
||||
Requires.private: libpcre2-8
|
||||
|
|
|
@ -0,0 +1,355 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This file is a Python module containing common lists and functions for the
|
||||
# GenerateXXX scripts that create various .c and .h files from Unicode data
|
||||
# files. It was created as part of a re-organization of these scripts in
|
||||
# December 2021.
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DATA LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
||||
|
||||
bidi_classes = [
|
||||
'AL', 'Arabic letter',
|
||||
'AN', 'Arabic number',
|
||||
'B', 'Paragraph separator',
|
||||
'BN', 'Boundary neutral',
|
||||
'CS', 'Common separator',
|
||||
'EN', 'European number',
|
||||
'ES', 'European separator',
|
||||
'ET', 'European terminator',
|
||||
'FSI', 'First strong isolate',
|
||||
'L', 'Left to right',
|
||||
'LRE', 'Left to right embedding',
|
||||
'LRI', 'Left to right isolate',
|
||||
'LRO', 'Left to right override',
|
||||
'NSM', 'Non-spacing mark',
|
||||
'ON', 'Other neutral',
|
||||
'PDF', 'Pop directional format',
|
||||
'PDI', 'Pop directional isolate',
|
||||
'R', 'Right to left',
|
||||
'RLE', 'Right to left embedding',
|
||||
'RLI', 'Right to left isolate',
|
||||
'RLO', 'Right to left override',
|
||||
'S', 'Segment separator',
|
||||
'WS', 'White space'
|
||||
]
|
||||
|
||||
# Particular category property names, with comments. NOTE: If ever this list
|
||||
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
|
||||
# must be edited to keep in step.
|
||||
|
||||
category_names = [
|
||||
'Cc', 'Control',
|
||||
'Cf', 'Format',
|
||||
'Cn', 'Unassigned',
|
||||
'Co', 'Private use',
|
||||
'Cs', 'Surrogate',
|
||||
'Ll', 'Lower case letter',
|
||||
'Lm', 'Modifier letter',
|
||||
'Lo', 'Other letter',
|
||||
'Lt', 'Title case letter',
|
||||
'Lu', 'Upper case letter',
|
||||
'Mc', 'Spacing mark',
|
||||
'Me', 'Enclosing mark',
|
||||
'Mn', 'Non-spacing mark',
|
||||
'Nd', 'Decimal number',
|
||||
'Nl', 'Letter number',
|
||||
'No', 'Other number',
|
||||
'Pc', 'Connector punctuation',
|
||||
'Pd', 'Dash punctuation',
|
||||
'Pe', 'Close punctuation',
|
||||
'Pf', 'Final punctuation',
|
||||
'Pi', 'Initial punctuation',
|
||||
'Po', 'Other punctuation',
|
||||
'Ps', 'Open punctuation',
|
||||
'Sc', 'Currency symbol',
|
||||
'Sk', 'Modifier symbol',
|
||||
'Sm', 'Mathematical symbol',
|
||||
'So', 'Other symbol',
|
||||
'Zl', 'Line separator',
|
||||
'Zp', 'Paragraph separator',
|
||||
'Zs', 'Space separator'
|
||||
]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_properties = [
|
||||
'CR', ' 0',
|
||||
'LF', ' 1',
|
||||
'Control', ' 2',
|
||||
'Extend', ' 3',
|
||||
'Prepend', ' 4',
|
||||
'SpacingMark', ' 5',
|
||||
'L', ' 6 Hangul syllable type L',
|
||||
'V', ' 7 Hangul syllable type V',
|
||||
'T', ' 8 Hangul syllable type T',
|
||||
'LV', ' 9 Hangul syllable type LV',
|
||||
'LVT', '10 Hangul syllable type LVT',
|
||||
'Regional_Indicator', '11',
|
||||
'Other', '12',
|
||||
'ZWJ', '13',
|
||||
'Extended_Pictographic', '14'
|
||||
]
|
||||
|
||||
# List of files from which the names of Boolean properties are obtained, along
|
||||
# with a list of regex patterns for properties to be ignored, and a list of
|
||||
# extra pattern names to add.
|
||||
|
||||
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
|
||||
bool_propsignore = [r'^Other_', r'^Hyphen$']
|
||||
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GET BOOLEAN PROPERTY NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Get a list of Boolean property names from a number of files.
|
||||
|
||||
def getbpropslist():
    """Return the sorted list of Boolean property names.

    Each file named in bool_propsfiles is read from the Unicode.tables
    directory. On every line, anything from a '#' onwards is discarded and
    the remainder is split at semicolons; the second field is taken as a
    property name. A name equal to the previously seen one is skipped, as is
    any name matching a pattern in bool_propsignore. The names in
    bool_propsextras are added before the list is sorted.

    If a file cannot be opened, a message is printed and the script exits
    with status 1.
    """
    # Imported locally because this function is called while the module is
    # still being loaded, before the module-level "import sys" has run.
    import sys

    bplist = []
    bplast = ""

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            file = open(path, 'r')
        except IOError:
            print(f"** Couldn't open {path}\n")
            sys.exit(1)

        # The with statement closes the file even if reading fails part-way.
        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = list(map(str.strip, line.split(';')))
                if len(data) <= 1 or data[1] == bplast:
                    continue
                bplast = data[1]
                # Keep the name only when no ignore pattern matches it.
                if not any(re.match(pat, bplast) for pat in bool_propsignore):
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist
|
||||
|
||||
bool_properties = getbpropslist()
|
||||
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# COLLECTING PROPERTY NAMES AND ALIASES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_names = ['Unknown']
|
||||
abbreviations = {}
|
||||
|
||||
def collect_property_names():
    """Fill script_names and abbreviations from the Unicode data files.

    Scripts.txt supplies the script names: each name that differs from the
    one on the previous matching line is appended to script_names.

    PropertyValueAliases.txt supplies script abbreviations: for every "sc"
    line the third field becomes a key in abbreviations whose value is a
    tuple of the other names (empty when the second and third fields are the
    same).

    PropertyAliases.txt supplies abbreviations for Boolean properties: when
    the second field is in bool_properties it becomes a key whose value is a
    tuple of the first field and, if present, the third field.
    """
    global script_names
    global abbreviations

    script_line = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    previous = ""
    with open("Unicode.tables/Scripts.txt") as scripts:
        for text in scripts:
            found = script_line.match(text)
            if found is not None and found.group(1) != previous:
                previous = found.group(1)
                script_names.append(previous)

    # A line may carry a trailing comment, so simply splitting at the
    # semicolons is not enough; a regex picks out the fields instead.
    value_line = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt") as value_aliases:
        for text in value_aliases:
            found = value_line.match(text)
            if found is None:
                continue

            prop, short, full, extra = found.groups()
            if prop != "sc":
                continue

            if short == full:
                abbreviations[full] = ()
            elif extra is None:
                abbreviations[full] = (short,)
            else:
                abbreviations[full] = (short, extra)

    # Boolean property abbreviations go into the same dictionary.
    bool_line = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')

    with open("Unicode.tables/PropertyAliases.txt") as prop_aliases:
        for text in prop_aliases:
            found = bool_line.match(text)
            if found is None:
                continue

            short, full, extra = found.groups()
            if full in bool_properties:
                abbreviations[full] = (short,) if extra is None else (short, extra)
|
||||
|
||||
collect_property_names()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# REORDERING SCRIPT NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_abbrevs = []
|
||||
|
||||
def reorder_scripts():
    """Reorder script_names and script_abbrevs in step with each other.

    First, one abbreviation per entry of script_names is appended to
    script_abbrevs: the first abbreviation recorded in abbreviations, or the
    script name itself when none is recorded. Then both lists are rebuilt so
    that scripts whose abbreviation is listed in ScriptExtensions.txt come
    first, followed by the rest; relative order within each group is kept.
    """
    global script_names
    global script_abbrevs
    global abbreviations

    script_abbrevs.extend(
        abbreviations[name][0] if abbreviations[name] else name
        for name in script_names
    )

    # Gather every abbreviation that occurs in a script extension list.
    listed = set()
    with open("Unicode.tables/ScriptExtensions.txt") as ext_file:
        ext_line = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
        for text in ext_file:
            found = ext_line.match(text)
            if found is not None:
                listed.update(found.group(1).split(" "))

    # A stable sort on "not listed" moves the listed scripts to the front
    # (False sorts before True) without disturbing order inside each group.
    order = sorted(range(len(script_abbrevs)),
                   key=lambda idx: script_abbrevs[idx] not in listed)

    script_names = [script_names[idx] for idx in order]
    script_abbrevs = [script_abbrevs[idx] for idx in order]
|
||||
|
||||
reorder_scripts()
|
||||
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DERIVED LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Create general character property names from the first letters of the
|
||||
# particular categories.
|
||||
|
||||
# category_names alternates property names and comments, so only the entries
# at even positions are names; their first letters are the general categories.
gcn_set = {category_names[pos][0] for pos in range(0, len(category_names), 2)}
general_category_names = sorted(gcn_set)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
|
||||
# Open an output file, using the command's argument or a default. Write common
|
||||
# preliminary header information.
|
||||
|
||||
def open_output(default):
    """Open the output file and write the common header comment to it.

    The file name is the single command line argument if one was given,
    otherwise default. The header names the running script (sys.argv[0] with
    any leading directory part up to the last '/' removed) as the one to
    modify. The open file object is returned, positioned after the header.

    More than one command line argument, or failure to open the file, prints
    a message and exits with status 1.
    """
    arg_count = len(sys.argv)
    if arg_count > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)

    output_name = sys.argv[1] if arg_count == 2 else default

    try:
        file = open(output_name, "w")
    except IOError:
        print("** Couldn't open %s" % output_name)
        sys.exit(1)

    # Keep only what follows the last slash of the script path.
    script_name = sys.argv[0].rsplit('/', 1)[-1]

    preamble = """\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
"""

    instructions = ("Instead, modify the maint/%s script and run it to generate\n"
                    "a new version of this code.\n\n" % script_name)

    licence = """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n"""

    file.write(preamble)
    file.write(instructions)
    file.write(licence)
    return file
|
||||
|
||||
# End of UcpCommon.py
|
|
@ -0,0 +1,188 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This file auto-generates Unicode property tests and their expected output.
|
||||
# It is recommended to re-run this generator after the Unicode files are
|
||||
# updated. The names of the generated files are `testinput26` and `testoutput26`.
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from GenerateCommon import \
|
||||
script_names, \
|
||||
script_abbrevs
|
||||
|
||||
def write_both(text):
    """Write *text* to the test input file and then to the expected-output file.

    Relies on the module-level ``input_file`` and ``output_file`` objects
    having been opened before the first call.
    """
    for stream in (input_file, output_file):
        stream.write(text)
|
||||
|
||||
def to_string_char(ch_idx):
    """Return the text used to represent code point *ch_idx* in a test line.

    Code points 32..127 are returned as the literal character. Everything
    else is returned as a ``\\x{...}`` escape with lowercase hex digits;
    values below 16 get a leading zero so the escape has two digits.
    """
    if 32 <= ch_idx < 128:
        return chr(ch_idx)

    # Values below 16 would otherwise produce a single hex digit.
    template = "\\x{0%x}" if ch_idx < 16 else "\\x{%x}"
    return template % ch_idx
|
||||
|
||||
# Work out where the generated files go. At most one command-line argument
# is accepted; when present it names the output directory and is given a
# trailing "/" if it lacks one, so the file names can simply be appended.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

output_directory = sys.argv[1] if len(sys.argv) == 2 else ""
if len(sys.argv) == 2 and not output_directory.endswith("/"):
    output_directory = output_directory + "/"

# Both files are created here; failure to open either one is fatal.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
|
||||
|
||||
def _read_property_ranges(path, property_re):
    """Yield ``(low, high, value)`` for each line of *path* matched by *property_re*.

    ``low`` and ``high`` are the inclusive code point bounds parsed from the
    hexadecimal fields (``high == low`` when the line names a single code
    point) and ``value`` is the text of the third capture group. Lines that
    do not match are skipped.
    """
    with open(path) as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            yield low, high, match_obj.group(3)


def _emit_test(pattern_line, code_point, matches):
    """Write one pattern/subject pair plus its expected result.

    The pattern and subject lines go to both files; the expected result
    (`` 0: <char>`` when *matches* is true, ``No match`` otherwise) goes to
    the output file only. A blank line terminates the test in both files.
    """
    subject = to_string_char(code_point)
    write_both(pattern_line)
    write_both(" %s\n" % subject)
    if matches:
        output_file.write(" 0: %s\n" % subject)
    else:
        output_file.write("No match\n")
    write_both("\n")


def gen_script_tests():
    """Generate the script and script-extension tests for every known script.

    Reads ``Unicode.tables/Scripts.txt`` and
    ``Unicode.tables/ScriptExtensions.txt`` (paths relative to the current
    directory) and writes the tests through ``write_both`` / ``output_file``.

    One record is kept per entry of ``script_names``, laid out as:
      [0] low bound of the first Scripts.txt range read for the script
      [1] high bound of the last Scripts.txt range read for the script
      [2] lowest low bound over its ScriptExtensions.txt ranges, or None
      [3] highest high bound over its ScriptExtensions.txt ranges, or None
      [4] first code point found in one of its extension ranges whose
          Scripts.txt script is a different one, or None
    """
    script_data = [None] * len(script_names)
    # Script name assigned to each code point by Scripts.txt.
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

    # Consecutive lines usually name the same script, so the index lookup is
    # only repeated when the name changes.
    prev_name = ""
    script_idx = -1

    for low, high, name in _read_property_ranges("Unicode.tables/Scripts.txt", property_re):
        if name != prev_name:
            script_idx = script_names.index(name)
            prev_name = name

        char_data[low] = name
        for code_point in range(low + 1, high + 1):
            char_data[code_point] = name

        if script_data[script_idx] is None:
            script_data[script_idx] = [low, None, None, None, None]
        script_data[script_idx][1] = high

    # Maps a script abbreviation to its index once it has been seen in
    # ScriptExtensions.txt.
    extended_script_indices = {}

    for low, high, abbrevs in _read_property_ranges("Unicode.tables/ScriptExtensions.txt", property_re):
        for abbrev in abbrevs.split(" "):
            if abbrev not in extended_script_indices:
                script_idx = script_abbrevs.index(abbrev)
                extended_script_indices[abbrev] = script_idx
                rec = script_data[script_idx]
                rec[2] = low
                rec[3] = high
            else:
                script_idx = extended_script_indices[abbrev]
                rec = script_data[script_idx]
                rec[2] = min(rec[2], low)
                rec[3] = max(rec[3], high)

            if rec[4] is None:
                # Look for a character that belongs to this script only
                # through its extension list.
                name = script_names[script_idx]
                for code_point in range(low, high + 1):
                    if char_data[code_point] != name:
                        rec[4] = code_point
                        break

    # Alternate between the short and long spelling of the extension
    # property name from one script to the next.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        _emit_test("/^\\p{sc=%s}/utf\n" % script_name, rec[0], True)
        _emit_test("/^\\p{Script=%s}/utf\n" % script_abbrev, rec[1], True)

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            _emit_test("/^\\p{%s}/utf\n" % script_name, rec[2], True)
            _emit_test("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev), rec[3], True)

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                _emit_test("/^\\p{%s}/utf\n" % script_name, rec[4], True)
                _emit_test("/^\\p{sc=%s}/utf\n" % script_name, rec[4], False)
            else:
                print("External character has not found for %s" % script_name)

        # Use the code point just past the highest bound recorded for the
        # script as a character expected not to match it.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        _emit_test("/^\\p{%s}/utf\n" % script_name, high + 1, False)
|
||||
|
||||
|
||||
# Generate all tests, then terminate both files with the end marker.
gen_script_tests()

write_both("# End of testinput26\n")

# Close explicitly so the generated files are flushed to disk here rather
# than whenever the interpreter finalizes the file objects.
input_file.close()
output_file.close()
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue